On 11/6/19 11:58 AM, Tomas Vondra wrote:
On Wed, Nov 06, 2019 at 08:54:40PM +0100, Tomas Vondra wrote:
On Mon, Oct 28, 2019 at 04:20:48PM +0100, Tomas Vondra wrote:
Hi,
PostgreSQL 10 introduced extended statistics, allowing us to consider
correlation between columns to improve estimates, and PostgreSQL 12
added support for MCV statistics. But we still had the limitation that
we only allowed using a single extended statistics per relation, i.e.
given a table with two extended stats
CREATE TABLE t (a int, b int, c int, d int);
CREATE STATISTICS s1 (mcv) ON a, b FROM t;
CREATE STATISTICS s2 (mcv) ON c, d FROM t;
and a query
SELECT * FROM t WHERE a = 1 AND b = 1 AND c = 1 AND d = 1;
we only ever used one of the statistics (and we considered them in a not
particularly well determined order).
This patch addresses this by using as many extended stats as possible,
by adding a loop to statext_mcv_clauselist_selectivity(). In each step
we pick the "best" applicable statistics (in the sense of covering the
most attributes) and factor it into the oveall estimate.
Tomas,
Your patch compiles and passes the regression tests for me on debian
linux under master.
Since your patch does not include modified regression tests, I wrote a
test that I expected to improve under this new code, but running it both
before and after applying your patch, there is no change. Please find
the modified test attached. Am I wrong to expect some change in this
test's output? If so, can you provide a test example that works
differently under your patch?
Thanks!
--
Mark Dilger
diff --git a/src/test/regress/expected/stats_ext.out
b/src/test/regress/expected/stats_ext.out
index b65228fa07..8e20df25fc 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -353,6 +353,298 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM
ndistinct GROUP BY a, d
500 | 50
(1 row)
+-- muliple extended statistics
+CREATE TABLE multistats (
+ i2 INTEGER,
+ i3 INTEGER,
+ i17 INTEGER,
+ i19 INTEGER,
+ i34 INTEGER,
+ i38 INTEGER,
+ i51 INTEGER,
+ i57 INTEGER
+);
+INSERT INTO multistats (i2, i3, i17, i19, i34, i38, i51, i57)
+ SELECT i % 2, i % 3, i % 17, i % 19, i % 34, i % 38, i % 51, i % 57
+ FROM generate_series(1, 500000) s(i);
+-- estimates without mcv statistics
+ANALYZE multistats;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i34');
+ estimated | actual
+-----------+--------
+ 68 | 34
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i38');
+ estimated | actual
+-----------+--------
+ 76 | 38
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i34');
+ estimated | actual
+-----------+--------
+ 578 | 34
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i38');
+ estimated | actual
+-----------+--------
+ 722 | 38
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i34, i38');
+ estimated | actual
+-----------+--------
+ 50000 | 646
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i51');
+ estimated | actual
+-----------+--------
+ 153 | 51
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i57');
+ estimated | actual
+-----------+--------
+ 171 | 57
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i51');
+ estimated | actual
+-----------+--------
+ 867 | 51
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i57');
+ estimated | actual
+-----------+--------
+ 1083 | 57
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i51, i57');
+ estimated | actual
+-----------+--------
+ 50000 | 969
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i34');
+ estimated | actual
+-----------+--------
+ 2500 | 29411
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i38');
+ estimated | actual
+-----------+--------
+ 2500 | 26315
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34');
+ estimated | actual
+-----------+--------
+ 2500 | 250001
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i38');
+ estimated | actual
+-----------+--------
+ 2500 | 250001
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
AND i19 = i38');
+ estimated | actual
+-----------+--------
+ 12 | 125387
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
OR i19 = i38');
+ estimated | actual
+-----------+--------
+ 4988 | 374615
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i51');
+ estimated | actual
+-----------+--------
+ 2500 | 29411
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i57');
+ estimated | actual
+-----------+--------
+ 2500 | 26315
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51');
+ estimated | actual
+-----------+--------
+ 2500 | 166667
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i57');
+ estimated | actual
+-----------+--------
+ 2500 | 166667
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
AND i19 = i57');
+ estimated | actual
+-----------+--------
+ 12 | 55727
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
OR i19 = i57');
+ estimated | actual
+-----------+--------
+ 4988 | 277607
+(1 row)
+
+-- create mcv statistics over pairs of strongly correlated columns
+CREATE STATISTICS ms_2_34 (mcv) ON i2, i34 FROM multistats;
+CREATE STATISTICS ms_2_38 (mcv) ON i2, i38 FROM multistats;
+CREATE STATISTICS ms_3_51 (mcv) ON i3, i51 FROM multistats;
+CREATE STATISTICS ms_3_57 (mcv) ON i3, i57 FROM multistats;
+CREATE STATISTICS ms_17_34 (mcv) ON i17, i34 FROM multistats;
+CREATE STATISTICS ms_17_51 (mcv) ON i17, i51 FROM multistats;
+CREATE STATISTICS ms_19_38 (mcv) ON i19, i38 FROM multistats;
+CREATE STATISTICS ms_19_57 (mcv) ON i19, i57 FROM multistats;
+-- estimates with multiple strongly correlated pairs of mcv statistics to
choose from
+ANALYZE multistats;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i34');
+ estimated | actual
+-----------+--------
+ 68 | 34
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i38');
+ estimated | actual
+-----------+--------
+ 76 | 38
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i34');
+ estimated | actual
+-----------+--------
+ 578 | 34
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i38');
+ estimated | actual
+-----------+--------
+ 722 | 38
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i34, i38');
+ estimated | actual
+-----------+--------
+ 50000 | 646
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i51');
+ estimated | actual
+-----------+--------
+ 153 | 51
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i57');
+ estimated | actual
+-----------+--------
+ 171 | 57
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i51');
+ estimated | actual
+-----------+--------
+ 867 | 51
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i57');
+ estimated | actual
+-----------+--------
+ 1083 | 57
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i51, i57');
+ estimated | actual
+-----------+--------
+ 50000 | 969
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i34');
+ estimated | actual
+-----------+--------
+ 2500 | 29411
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i38');
+ estimated | actual
+-----------+--------
+ 2500 | 26315
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34');
+ estimated | actual
+-----------+--------
+ 2500 | 250001
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i38');
+ estimated | actual
+-----------+--------
+ 2500 | 250001
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
AND i19 = i38');
+ estimated | actual
+-----------+--------
+ 12 | 125387
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
OR i19 = i38');
+ estimated | actual
+-----------+--------
+ 4988 | 374615
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i51');
+ estimated | actual
+-----------+--------
+ 2500 | 29411
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i57');
+ estimated | actual
+-----------+--------
+ 2500 | 26315
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51');
+ estimated | actual
+-----------+--------
+ 2500 | 166667
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i57');
+ estimated | actual
+-----------+--------
+ 2500 | 166667
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
AND i19 = i57');
+ estimated | actual
+-----------+--------
+ 12 | 55727
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
OR i19 = i57');
+ estimated | actual
+-----------+--------
+ 4988 | 277607
+(1 row)
+
+DROP TABLE multistats;
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,
diff --git a/src/test/regress/sql/stats_ext.sql
b/src/test/regress/sql/stats_ext.sql
index 040ee97a1e..7eee03513a 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -224,6 +224,84 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM
ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a,
d');
+-- muliple extended statistics
+CREATE TABLE multistats (
+ i2 INTEGER,
+ i3 INTEGER,
+ i17 INTEGER,
+ i19 INTEGER,
+ i34 INTEGER,
+ i38 INTEGER,
+ i51 INTEGER,
+ i57 INTEGER
+);
+
+INSERT INTO multistats (i2, i3, i17, i19, i34, i38, i51, i57)
+ SELECT i % 2, i % 3, i % 17, i % 19, i % 34, i % 38, i % 51, i % 57
+ FROM generate_series(1, 500000) s(i);
+
+-- estimates without mcv statistics
+ANALYZE multistats;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i34');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i34');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i34, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i51');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i57');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i51');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i57');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i51, i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i34');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
AND i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
OR i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i51');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
AND i19 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
OR i19 = i57');
+
+-- create mcv statistics over pairs of strongly correlated columns
+CREATE STATISTICS ms_2_34 (mcv) ON i2, i34 FROM multistats;
+CREATE STATISTICS ms_2_38 (mcv) ON i2, i38 FROM multistats;
+CREATE STATISTICS ms_3_51 (mcv) ON i3, i51 FROM multistats;
+CREATE STATISTICS ms_3_57 (mcv) ON i3, i57 FROM multistats;
+CREATE STATISTICS ms_17_34 (mcv) ON i17, i34 FROM multistats;
+CREATE STATISTICS ms_17_51 (mcv) ON i17, i51 FROM multistats;
+CREATE STATISTICS ms_19_38 (mcv) ON i19, i38 FROM multistats;
+CREATE STATISTICS ms_19_57 (mcv) ON i19, i57 FROM multistats;
+
+-- estimates with multiple strongly correlated pairs of mcv statistics to
choose from
+ANALYZE multistats;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i34');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i2, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i34');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i34, i38');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i51');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i3, i57');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i51');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i19, i57');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM multistats GROUP BY
i17, i19, i51, i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i34');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i2 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
AND i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i34
OR i19 = i38');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i51');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i3 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i19 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
AND i19 = i57');
+SELECT * FROM check_estimated_rows('SELECT * FROM multistats WHERE i17 = i51
OR i19 = i57');
+
+DROP TABLE multistats;
+
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,