Re: allow partial union-all and improve parallel subquery costing

Luc Vlaming Wed, 14 Oct 2020 00:39:04 -0700

Hi,

It seems I ran the wrong make checks to verify everything is correct (make 
check instead
of make installcheck-world) and this uncovered another regress test change. I 
also noticed
the statistics are sometimes giving different row count results so I increased 
the row
statistics target to make sure the regress output is stable. Updated patch 
attached which
now successfully runs installcheck-world for v13 and master.

Kind regards,
Luc

________________________________________
From: Luc Vlaming <l...@swarm64.com>
Sent: Tuesday, October 13, 2020 10:57 AM
To: pgsql-hackers
Subject: allow partial union-all and improve parallel subquery costing

Hi,

While developing some improvements for TPC-DS queries I found out that with
UNION ALL partial paths are not emitted. Whilst fixing that I also came across
the subquery costing which does not seem to consider parallelism when doing
the costing.

I added a simplified testcase in pg-regress to show this goes wrong, and
attached also a before and after explain output of tpc-ds SF100 query 5
based on version 12.4.

I hope I followed all etiquette and these kind of improvements are welcome.

Kind regards,
Luc
Swarm64

From 622903bff2108cb15d8fcf92b327f5ef00d41daa Mon Sep 17 00:00:00 2001
From: Luc Vlaming <l...@swarm64.com>
Date: Wed, 14 Oct 2020 09:22:46 +0200
Subject: [PATCH v2] Allow partial UNION ALL; improve parallel subquery costing

---
 src/backend/optimizer/path/costsize.c         | 11 ++++
 src/backend/optimizer/prep/prepunion.c        |  4 ++
 .../regress/expected/incremental_sort.out     | 10 ++--
 src/test/regress/expected/union.out           | 52 +++++++++++++++++++
 src/test/regress/sql/union.sql                | 37 +++++++++++++
 5 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index f39e6a9f80..c2018b3401 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1318,6 +1318,17 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
 	startup_cost += path->path.pathtarget->cost.startup;
 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
 
+	/* Adjust costing for parallelism, if used. */
+	if (path->path.parallel_workers > 0)
+	{
+		double		parallel_divisor = get_parallel_divisor(&path->path);
+
+		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
+
+		/* The CPU cost is divided among all the workers. */
+		run_cost /= parallel_divisor;
+	}
+
 	path->path.startup_cost += startup_cost;
 	path->path.total_cost += startup_cost + run_cost;
 }
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 745f443e5c..4001eb87f3 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -678,6 +678,10 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 							   NIL, NULL,
 							   parallel_workers, enable_parallel_append,
 							   NIL, -1);
+
+		if (op->all && enable_parallel_append)
+			add_partial_path(result_rel, ppath);
+
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index e376ea1276..c9596b8e12 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1459,13 +1459,11 @@ explain (costs off) select * from t union select * from t order by 1,3;
    ->  Unique
          ->  Sort
                Sort Key: t.a, t.b, t.c
-               ->  Append
-                     ->  Gather
-                           Workers Planned: 2
+               ->  Gather
+                     Workers Planned: 2
+                     ->  Parallel Append
                            ->  Parallel Seq Scan on t
-                     ->  Gather
-                           Workers Planned: 2
                            ->  Parallel Seq Scan on t t_1
-(13 rows)
+(11 rows)
 
 drop table t;
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 6e72e92d80..cd553c9c0c 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1052,3 +1052,55 @@ where (x = 0) or (q1 >= q2 and q1 <= q2);
  4567890123456789 |  4567890123456789 | 1
 (6 rows)
 
+-- Test handling of appendrel with different types which disables the path flattening and
+-- forces a subquery node. for the subquery node ensure the rowcounts are correct.
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        tmp := regexp_match(ln, 'rows=(\d*)');
+        return query select tmp[1]::int;
+    end loop;
+end;
+$$;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 0;
+set parallel_tuple_cost = 0;
+set default_statistics_target = 1000;
+explain (costs off)
+select *, 0::int from tenk1 a
+union all
+select *, 1::bigint from tenk1 b;
+                   QUERY PLAN                   
+------------------------------------------------
+ Gather
+   Workers Planned: 2
+   ->  Parallel Append
+         ->  Subquery Scan on "*SELECT* 1"
+               ->  Parallel Seq Scan on tenk1 a
+         ->  Parallel Seq Scan on tenk1 b
+(6 rows)
+
+select check_estimated_rows('select *, 0::int from tenk1 a union all select *, 1::bigint from tenk1 b;');
+ check_estimated_rows 
+----------------------
+                19990
+                     
+                 8330
+                 4165
+                 4165
+                 4165
+(6 rows)
+
+reset parallel_setup_cost;
+reset parallel_tuple_cost;
+reset min_parallel_table_scan_size;
+reset default_statistics_target;
+drop function check_estimated_rows(text);
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 5f4881d594..7b4e2188f0 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -440,3 +440,40 @@ select * from
    union all
    select *, 1 as x from int8_tbl b) ss
 where (x = 0) or (q1 >= q2 and q1 <= q2);
+
+-- Test handling of appendrel with different types which disables the path flattening and
+-- forces a subquery node. for the subquery node ensure the rowcounts are correct.
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        tmp := regexp_match(ln, 'rows=(\d*)');
+        return query select tmp[1]::int;
+    end loop;
+end;
+$$;
+
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 0;
+set parallel_tuple_cost = 0;
+set default_statistics_target = 1000;
+
+explain (costs off)
+select *, 0::int from tenk1 a
+union all
+select *, 1::bigint from tenk1 b;
+
+select check_estimated_rows('select *, 0::int from tenk1 a union all select *, 1::bigint from tenk1 b;');
+
+reset parallel_setup_cost;
+reset parallel_tuple_cost;
+reset min_parallel_table_scan_size;
+reset default_statistics_target;
+drop function check_estimated_rows(text);
-- 
2.25.1

Re: allow partial union-all and improve parallel subquery costing

Reply via email to