On 11/4/20 2:05 AM, Tomas Vondra wrote:
Hi,
As pointed out in [1], BRIN is not properly handling toasted data, which
may easily lead to index tuples referencing TOAST-ed values. Which is
clearly wrong - it's trivial to trigger failues after a DELETE.
Attached is a patch that aims to fix this - AFAIK the brin_form_tuple
was simply missing the TOAST_INDEX_HACK stuff from index_form_tuple,
which ensures the data is detoasted and (possibly) re-compressed. The
code is mostly the same, with some BRIN-specific tweaks (looking at
oi_typecache instead of the index descriptor, etc.).
I also attach a simple SQL script that I used to trigger the issue. This
needs to be turned into a regression test, I'll work on that tomorrow.
OK, so here's an improved version of the fix - aside from the code (same
as in v1), there are two patches with regression tests. Ultimately those
should be merged with the fix, but this way it's possible to apply the
regression tests to trigger the issue.
The first test is fairly trivial - it simply builds index on toasted
data and then shows how an insert and select fail. There's a caveat,
that this requires a DELETE + VACUUM, and the VACUUM actually has to
cleanup the rows. So there must be no concurrent transactions that might
need the rows, which is unlikely in regression tests. So this requires
waiting for all running transactions to finish - I did that by building
an index concurrently. It's a bit strange, but it's better than any
other solution I could think of (timeout or some custom wait for xacts).
The second test is a bit redundant - it merely checks that both CREATE
INDEX and INSERT INTO fail the same way when the index tuple gets too
large. Before the fix there were some inconsistencies - the CREATE INDEX
succeeded because it used TOASTed data. So ultimately this tests the
same thing, but from a different perspective.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From aacea18d3f5f22e12ed49e3487fdfa46ac8d4c18 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <to...@2ndquadrant.com>
Date: Wed, 4 Nov 2020 18:09:44 +0100
Subject: [PATCH 1/3] detoast data for BRIN indexes
---
src/backend/access/brin/brin_tuple.c | 96 +++++++++++++++++++++++++++-
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
index 46e6b23c87..96eab9e87c 100644
--- a/src/backend/access/brin/brin_tuple.c
+++ b/src/backend/access/brin/brin_tuple.c
@@ -38,6 +38,17 @@
#include "utils/datum.h"
#include "utils/memutils.h"
+#include "access/detoast.h"
+#include "access/heaptoast.h"
+#include "access/toast_internals.h"
+
+/*
+ * This enables de-toasting of index entries. Needed until VACUUM is
+ * smart enough to rebuild indexes from scratch.
+ */
+#define TOAST_INDEX_HACK
+
+
static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
char *tp, bits8 *nullbits, bool nulls,
Datum *values, bool *allnulls, bool *hasnulls);
@@ -99,6 +110,12 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
Size len,
hoff,
data_len;
+ int i;
+
+#ifdef TOAST_INDEX_HACK
+ Datum *untoasted_values;
+ int nuntoasted = 0;
+#endif
Assert(brdesc->bd_totalstored > 0);
@@ -107,6 +124,10 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
phony_nullbitmap = (bits8 *)
palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
+#ifdef TOAST_INDEX_HACK
+ untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
+#endif
+
/*
* Set up the values/nulls arrays for heap_fill_tuple
*/
@@ -141,7 +162,75 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
for (datumno = 0;
datumno < brdesc->bd_info[keyno]->oi_nstored;
datumno++)
- values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
+ {
+ Datum value = tuple->bt_columns[keyno].bv_values[datumno];
+
+#ifdef TOAST_INDEX_HACK
+
+ /* We must look at the stored type, not at the index descriptor. */
+ TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno];
+
+ /* Do we need to free the value at the end? */
+ bool free_value = false;
+
+ /* For non-varlena types we don't need to do anything special */
+ if (atttype->typlen != -1)
+ {
+ values[idxattno++] = value;
+ continue;
+ }
+
+ /*
+ * Do nothing if value is not of varlena type. We don't need to
+ * care about NULL values here, thanks to bv_allnulls above.
+ *
+ * If value is stored EXTERNAL, must fetch it so we are not
+ * depending on outside storage.
+ *
+ * XXX Is this actually true? Could it be that the summary is
+ * NULL even for range with non-NULL data? E.g. degenerate bloom
+ * filter may be thrown away, etc.
+ */
+ if (VARATT_IS_EXTERNAL(DatumGetPointer(value)))
+ {
+ value = PointerGetDatum(detoast_external_attr((struct varlena *)
+ DatumGetPointer(value)));
+ free_value = true;
+ }
+
+ /*
+ * If value is above size target, and is of a compressible datatype,
+ * try to compress it in-line.
+ */
+ if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) &&
+ VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET &&
+ (atttype->typstorage == TYPSTORAGE_EXTENDED ||
+ atttype->typstorage == TYPSTORAGE_MAIN))
+ {
+ Datum cvalue = toast_compress_datum(value);
+
+ if (DatumGetPointer(cvalue) != NULL)
+ {
+ /* successful compression */
+ if (free_value)
+ pfree(DatumGetPointer(value));
+
+ value = cvalue;
+ free_value = true;
+ }
+ }
+
+ /*
+ * If we untoasted / compressed the value, we need to free it
+ * after forming the index tuple.
+ */
+ if (free_value)
+ untoasted_values[nuntoasted++] = value;
+
+#endif
+
+ values[idxattno++] = value;
+ }
}
/* Assert we did not overrun temp arrays */
@@ -193,6 +282,11 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
pfree(nulls);
pfree(phony_nullbitmap);
+#ifdef TOAST_INDEX_HACK
+ for (i = 0; i < nuntoasted; i++)
+ pfree(DatumGetPointer(untoasted_values[i]));
+#endif
+
/*
* Now fill in the real null bitmasks. allnulls first.
*/
--
2.25.4
>From 32caeb5e2965e54a48446a97664501c04a733659 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <to...@2ndquadrant.com>
Date: Wed, 4 Nov 2020 18:28:45 +0100
Subject: [PATCH 2/3] brin regression test: detoast values
---
src/test/regress/expected/brin.out | 41 ++++++++++++++++++++++++++++++
src/test/regress/sql/brin.sql | 39 ++++++++++++++++++++++++++++
2 files changed, 80 insertions(+)
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index 18403498df..e53d6e4885 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -526,3 +526,44 @@ EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
Filter: (b = 1)
(2 rows)
+-- make sure data are properly de-toasted in BRIN index
+CREATE TABLE brintest_3 (a text, b text, c text, d text);
+-- long random strings (~2000 chars each, so ~6kB for min/max on two
+-- columns) to trigger toasting
+WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
+INSERT INTO brintest_3
+SELECT val, val, val, val FROM rand_value;
+CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
+DELETE FROM brintest_3;
+-- We need to wait a bit for all transactions to complete, so that the
+-- vacuum actually removes the TOAST rows. Creating an index concurrently
+-- is a one way to achieve that, because it does exactly such wait.
+CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
+DROP INDEX brin_test_temp_idx;
+-- vacuum the table, to discard TOAST data
+VACUUM brintest_3;
+-- retry insert with a different random-looking (but deterministic) value
+-- the value is different, and so should replace either min or max in the
+-- brin summary
+WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
+INSERT INTO brintest_3
+SELECT val, val, val, val FROM rand_value;
+-- now try some queries, accessing the brin index
+SET enable_seqscan = off;
+EXPLAIN (COSTS OFF)
+SELECT * FROM brintest_3 WHERE b < '0';
+ QUERY PLAN
+------------------------------------------------
+ Bitmap Heap Scan on brintest_3
+ Recheck Cond: (b < '0'::text)
+ -> Bitmap Index Scan on brin_test_toast_idx
+ Index Cond: (b < '0'::text)
+(4 rows)
+
+SELECT * FROM brintest_3 WHERE b < '0';
+ a | b | c | d
+---+---+---+---
+(0 rows)
+
+DROP TABLE brintest_3;
+RESET enable_seqscan;
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
index d1a82474f3..3bd866d947 100644
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -470,3 +470,42 @@ VACUUM ANALYZE brin_test;
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
-- Ensure brin index is not used when values are not correlated
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
+
+-- make sure data are properly de-toasted in BRIN index
+CREATE TABLE brintest_3 (a text, b text, c text, d text);
+
+-- long random strings (~2000 chars each, so ~6kB for min/max on two
+-- columns) to trigger toasting
+WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
+INSERT INTO brintest_3
+SELECT val, val, val, val FROM rand_value;
+
+CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
+DELETE FROM brintest_3;
+
+-- We need to wait a bit for all transactions to complete, so that the
+-- vacuum actually removes the TOAST rows. Creating an index concurrently
+-- is a one way to achieve that, because it does exactly such wait.
+CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
+DROP INDEX brin_test_temp_idx;
+
+-- vacuum the table, to discard TOAST data
+VACUUM brintest_3;
+
+-- retry insert with a different random-looking (but deterministic) value
+-- the value is different, and so should replace either min or max in the
+-- brin summary
+WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
+INSERT INTO brintest_3
+SELECT val, val, val, val FROM rand_value;
+
+-- now try some queries, accessing the brin index
+SET enable_seqscan = off;
+
+EXPLAIN (COSTS OFF)
+SELECT * FROM brintest_3 WHERE b < '0';
+
+SELECT * FROM brintest_3 WHERE b < '0';
+
+DROP TABLE brintest_3;
+RESET enable_seqscan;
--
2.25.4
>From 2fa6ad38c6d7337981047db34b28531516701bfd Mon Sep 17 00:00:00 2001
From: Tomas Vondra <to...@2ndquadrant.com>
Date: Wed, 4 Nov 2020 20:10:10 +0100
Subject: [PATCH 3/3] brin regression test: index tuple size check
---
src/test/regress/expected/brin.out | 18 ++++++++++++++++++
src/test/regress/sql/brin.sql | 24 ++++++++++++++++++++++++
2 files changed, 42 insertions(+)
diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out
index e53d6e4885..79ec028f69 100644
--- a/src/test/regress/expected/brin.out
+++ b/src/test/regress/expected/brin.out
@@ -567,3 +567,21 @@ SELECT * FROM brintest_3 WHERE b < '0';
DROP TABLE brintest_3;
RESET enable_seqscan;
+-- make sure we're enforcing the index tuple size consistently
+CREATE TABLE brintest_4 (a text, b text, c text, d text);
+-- long random strings (~2400 chars each, so ~10kB for min/max on two
+-- columns) to trigger toasting
+WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,75) s(i))
+INSERT INTO brintest_4
+SELECT val, val, val, val FROM rand_value;
+-- should fail, because of index tuple being too large
+CREATE INDEX brin_test_toast_idx ON brintest_4 USING brin (b, c);
+ERROR: index row size 9624 exceeds maximum 8152 for index "brin_test_toast_idx"
+-- now try insert with an existing index
+TRUNCATE brintest_4;
+CREATE INDEX brin_test_toast_idx ON brintest_4 USING brin (b, c);
+WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,75) s(i))
+INSERT INTO brintest_4
+SELECT val, val, val, val FROM rand_value;
+ERROR: index row size 9624 exceeds maximum 8152 for index "brin_test_toast_idx"
+DROP TABLE brintest_4;
diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
index 3bd866d947..f1b510fb0b 100644
--- a/src/test/regress/sql/brin.sql
+++ b/src/test/regress/sql/brin.sql
@@ -509,3 +509,27 @@ SELECT * FROM brintest_3 WHERE b < '0';
DROP TABLE brintest_3;
RESET enable_seqscan;
+
+
+-- make sure we're enforcing the index tuple size consistently
+CREATE TABLE brintest_4 (a text, b text, c text, d text);
+
+-- long random strings (~2400 chars each, so ~10kB for min/max on two
+-- columns) to trigger toasting
+WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,75) s(i))
+INSERT INTO brintest_4
+SELECT val, val, val, val FROM rand_value;
+
+-- should fail, because of index tuple being too large
+CREATE INDEX brin_test_toast_idx ON brintest_4 USING brin (b, c);
+
+-- now try insert with an existing index
+TRUNCATE brintest_4;
+
+CREATE INDEX brin_test_toast_idx ON brintest_4 USING brin (b, c);
+
+WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,75) s(i))
+INSERT INTO brintest_4
+SELECT val, val, val, val FROM rand_value;
+
+DROP TABLE brintest_4;
--
2.25.4