On 2019/03/31 1:06, Amit Langote wrote: > On Sun, Mar 31, 2019 at 12:11 AM Tom Lane <t...@sss.pgh.pa.us> wrote: >> Amit Langote <amitlangot...@gmail.com> writes: >>> I think the performance results did prove that degradation due to >>> those loops over part_rels becomes significant for very large >>> partition counts. Is there a better solution than the bitmapset that >>> you have in mind? >> >> Hm, I didn't see much degradation in what you posted in >> <5c83dbca-12b5-1acf-0e85-58299e464...@lab.ntt.co.jp>. > > Sorry that I didn't mention the link to begin with, but I meant to > point to numbers that I reported on Monday this week. > > https://www.postgresql.org/message-id/19f54c17-1619-b228-10e5-ca343be6a4e8%40lab.ntt.co.jp > > You were complaining of the bitmapset being useless overhead for small > partition counts, but the numbers I get tend to suggest that any > degradation in performance is within noise range, whereas the > performance benefit from having them looks pretty significant for very > large partition counts. > >> I am curious as to why there seems to be more degradation >> for hash cases, as per Yoshikazu-san's results in >> <0F97FA9ABBDBE54F91744A9B37151A512BAC60@g01jpexmbkw24>, >> but whatever's accounting for the difference probably >> is not that. > > I suspected it may have been the lack of bitmapsets, but maybe only > Imai-san could've confirmed that by applying the live_parts patch too.
Yeah, I forgot to applying live_parts patch. I did same test again which I did for hash before. (BTW, thanks for committing speeding up patches!) [HEAD(428b260)] nparts TPS ====== ===== 2: 13134 (13240, 13290, 13071, 13172, 12896) 1024: 12627 (12489, 12635, 12716, 12732, 12562) 8192: 10289 (10216, 10265, 10171, 10278, 10514) [HEAD(428b260) + live_parts.diff] nparts TPS ====== ===== 2: 13277 (13112, 13290, 13241, 13360, 13382) 1024: 12821 (12930, 12849, 12909, 12700, 12716) 8192: 11102 (11134, 11158, 11114, 10997, 11109) Degradations of performance are below. My test results from above (with live_parts, HEAD(428b260) + live_parts.diff) nparts live_parts HEAD ====== ========== ==== 2: 13277 13134 1024: 12821 12627 8192: 11102 10289 11102/13277 = 83.6 % Amit-san's test results (with live_parts) > nparts v38 HEAD > ====== ==== ==== > 2 2971 2969 > 8 2980 1949 > 32 2955 733 > 128 2946 145 > 512 2924 11 > 1024 2986 3 > 4096 2702 0 > 8192 2531 OOM 2531/2971 = 85.2 % My test results I posted before (without live_parts) > nparts v38 HEAD > ====== ==== ==== > 0: 10538 10487 > 2: 6942 7028 > 4: 7043 5645 > 8: 6981 3954 > 16: 6932 2440 > 32: 6897 1243 > 64: 6897 309 > 128: 6753 120 > 256: 6727 46 > 512: 6708 12 > 1024: 6063 3 > 2048: 5894 1 > 4096: 5374 OOM > 8192: 4572 OOM 4572/6942 = 65.9 % Certainly, using bitmapset contributes to the performance when scanning one partition(few partitions) from large partitions. Thanks -- Imai Yoshikazu
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 34cc7da..e847655 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1516,6 +1516,9 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, populate_joinrel_with_paths(root, child_rel1, child_rel2, child_joinrel, child_sjinfo, child_restrictlist); + if(!IS_DUMMY_REL(child_joinrel)) + joinrel->live_parts = bms_add_member(joinrel->live_parts, + cnt_parts); } } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f08f1cd..9ddf42a 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7107,7 +7107,9 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, int partition_idx; /* Adjust each partition. */ - for (partition_idx = 0; partition_idx < rel->nparts; partition_idx++) + partition_idx = -1; + while ((partition_idx = bms_next_member(rel->live_parts, + partition_idx)) >= 0) { RelOptInfo *child_rel = rel->part_rels[partition_idx]; AppendRelInfo **appinfos; @@ -7115,9 +7117,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, List *child_scanjoin_targets = NIL; ListCell *lc; - /* Pruned or dummy children can be ignored. */ - if (child_rel == NULL || IS_DUMMY_REL(child_rel)) - continue; + Assert(child_rel != NULL); /* Translate scan/join targets for this child. */ appinfos = find_appinfos_by_relids(root, child_rel->relids, @@ -7197,7 +7197,6 @@ create_partitionwise_grouping_paths(PlannerInfo *root, PartitionwiseAggregateType patype, GroupPathExtraData *extra) { - int nparts = input_rel->nparts; int cnt_parts; List *grouped_live_children = NIL; List *partially_grouped_live_children = NIL; @@ -7209,7 +7208,9 @@ create_partitionwise_grouping_paths(PlannerInfo *root, partially_grouped_rel != NULL); /* Add paths for partitionwise aggregation/grouping. */ - for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + cnt_parts = -1; + while ((cnt_parts = bms_next_member(input_rel->live_parts, + cnt_parts)) >= 0) { RelOptInfo *child_input_rel = input_rel->part_rels[cnt_parts]; PathTarget *child_target = copy_pathtarget(target); @@ -7219,9 +7220,8 @@ create_partitionwise_grouping_paths(PlannerInfo *root, RelOptInfo *child_grouped_rel; RelOptInfo *child_partially_grouped_rel; - /* Pruned or dummy children can be ignored. */ - if (child_input_rel == NULL || IS_DUMMY_REL(child_input_rel)) - continue; + /* A live partition must have a RelOptInfo. */ + Assert(child_input_rel != NULL); /* * Copy the given "extra" structure as is and then override the diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index ccc8c11..c010599 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -328,6 +328,12 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, */ live_parts = prune_append_rel_partitions(relinfo); + /* + * Later steps that loop over part_rels should use these indexes + * to access unpruned partitions. + */ + relinfo->live_parts = live_parts; + /* Expand simple_rel_array and friends to hold child objects. */ num_live_parts = bms_num_members(live_parts); if (num_live_parts > 0) diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index f86f39c..c872fcf 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -1774,6 +1774,9 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, joinrel->partexprs[cnt] = partexpr; joinrel->nullable_partexprs[cnt] = nullable_partexpr; } + + /* Partitions will be added by try_partitionwise_join. */ + joinrel->live_parts = NULL; } /* diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 59f34f5..14e57e8 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -479,30 +479,23 @@ make_partitionedrel_pruneinfo(PlannerInfo *root, RelOptInfo *parentrel, subpart_map = (int *) palloc(nparts * sizeof(int)); memset(subpart_map, -1, nparts * sizeof(int)); relid_map = (Oid *) palloc0(nparts * sizeof(Oid)); - present_parts = NULL; + present_parts = bms_copy(subpart->live_parts); - for (i = 0; i < nparts; i++) + i = -1; + while ((i = bms_next_member(present_parts, i)) >= 0) { RelOptInfo *partrel = subpart->part_rels[i]; int subplanidx; int subpartidx; - /* Skip processing pruned partitions. */ - if (partrel == NULL) - continue; - + Assert(partrel != NULL); subplan_map[i] = subplanidx = relid_subplan_map[partrel->relid] - 1; subpart_map[i] = subpartidx = relid_subpart_map[partrel->relid] - 1; relid_map[i] = planner_rt_fetch(partrel->relid, root)->relid; - if (subplanidx >= 0) - { - present_parts = bms_add_member(present_parts, i); - /* Record finding this subplan */ + /* Record finding this subplan */ + if (subplanidx >= 0) subplansfound = bms_add_member(subplansfound, subplanidx); - } - else if (subpartidx >= 0) - present_parts = bms_add_member(present_parts, i); } /* Record the maps and other information. */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 88c8973..dbce41f 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -712,6 +712,10 @@ typedef struct RelOptInfo List *partition_qual; /* partition constraint */ struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, * stored in the same order of bounds */ + Bitmapset *live_parts; /* Indexes into part_rels of the non-NULL + * RelOptInfos of unpruned partitions; exists + * to avoid having to iterate over the entire + * part_rels array to filter NULL entries. */ List **partexprs; /* Non-nullable partition key expressions. */ List **nullable_partexprs; /* Nullable partition key expressions. */ List *partitioned_child_rels; /* List of RT indexes. */