On Fri, 1 Aug 2025 00:28:30 +0900 Yugo Nagata <nag...@sraoss.co.jp> wrote:
> Hi, > > On Tue, 24 Jun 2025 17:05:33 +0900 > Yugo Nagata <nag...@sraoss.co.jp> wrote: > > > Instead, I'm thinking of an alternative approach: expanding the expression > > at the time statistics are collected. > > I've attached a new patch in this approache. > > This allows to collect statistics on virtual generated columns. > > During ANALYZE, generation expressions are expanded, and statistics are > computed > using compute_expr_stats(). To support this, both compute_expr_stats() and > AnlExprData > are now exported from extended_stats.c. However, since they are no longer > specific > to extended statistics, it might be better to move them to analyze.c and > vacuum.h. > > To enable the optimizer to make use of these statistics, a new field named > virtual_gencols is added to RelOptInfo. This field holds the expressions of > virtual generated columns in the table. In examine_variable(), if an > expression > in a WHERE clause matches a virtual generated column, the corresponding > statistics > are used for that expression. > > Example: > > - Before applying the patch, the cardinality estimate is erroneous. > > test=# create table t (i int, j int generated always as (i*10) virtual); > CREATE TABLE > test=# insert into t select generate_series(1,1000); > INSERT 0 1000 > test=# insert into t select 1 from generate_series(1,1000); > INSERT 0 1000 > test=# analyze t; > ANALYZE > test=# explain analyze select * from t where j = 10; > QUERY PLAN > > ------------------------------------------------------------------------------------------------- > Seq Scan on t (cost=0.00..0.02 rows=1 width=8) (actual time=0.031..0.806 > rows=1001.00 loops=1) > Filter: ((i * 10) = 10) > Rows Removed by Filter: 999 > Buffers: shared hit=9 > Planning: > Buffers: shared hit=10 > Planning Time: 0.299 ms > Execution Time: 0.948 ms > (8 rows) > > > - After applying the patch, the cardinality estimate is correct. > > test=# analyze t; > ANALYZE > test=# explain analyze select * from t where j = 10; > QUERY PLAN > > ----------------------------------------------------------------------------------------------------- > Seq Scan on t (cost=0.00..41.50 rows=1001 width=8) (actual > time=0.034..0.871 rows=1001.00 loops=1) > Filter: ((i * 10) = 10) > Rows Removed by Filter: 999 > Buffers: shared hit=9 > Planning: > Buffers: shared hit=6 > Planning Time: 0.374 ms > Execution Time: 1.028 ms > (8 rows) > > > Note that the patch is still a work in progress, so documentation and tests > are not included. I've attached an updated patch. I modified the documentation to remove the statement that virtual generated columns do not have statistics. In addition, I added a test to ensure that statistics on virtual generated columns are available. Regards, Yugo Nagata -- Yugo Nagata <nag...@sraoss.co.jp>
>From f60edb8ce4b2fa135dc35d8e98d778e3e502cc5b Mon Sep 17 00:00:00 2001 From: Yugo Nagata <nag...@sraoss.co.jp> Date: Fri, 18 Jul 2025 09:58:56 +0900 Subject: [PATCH v2] Allow to collect statistics on virtual generated columns During ANALYZE, generation expressions are expanded, and statistics are computed using compute_expr_stats(). To support this, both compute_expr_stats() and AnlExprData are now exported from extended_stats.c. To enable the optimizer to make use of these statistics, a new field named virtual_gencols is added to RelOptInfo. This field holds the expressions of virtual generated columns in the table. In examine_variable(), if an expression in a WHERE clause matches a virtual generated column, the corresponding statistics are used for that expression. --- doc/src/sgml/ref/alter_table.sgml | 9 +-- src/backend/commands/analyze.c | 32 ++++++---- src/backend/optimizer/util/plancat.c | 60 +++++++++++++++++++ src/backend/statistics/extended_stats.c | 11 +--- src/backend/utils/adt/selfuncs.c | 44 +++++++++++++- src/include/nodes/pathnodes.h | 19 ++++++ .../statistics/extended_stats_internal.h | 9 +++ .../regress/expected/generated_virtual.out | 7 +++ src/test/regress/sql/generated_virtual.sql | 3 + 9 files changed, 166 insertions(+), 28 deletions(-) diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 8867da6c693..bffb07775f7 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -210,8 +210,6 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM When this form is used, the column's statistics are removed, so running <link linkend="sql-analyze"><command>ANALYZE</command></link> on the table afterwards is recommended. - For a virtual generated column, <command>ANALYZE</command> - is not necessary because such columns never have statistics. </para> </listitem> </varlistentry> @@ -275,12 +273,9 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM </para> <para> - When this form is used on a stored generated column, its statistics - are removed, so running - <link linkend="sql-analyze"><command>ANALYZE</command></link> + When this form is used, the column's statistics are removed, + so running <link linkend="sql-analyze"><command>ANALYZE</command></link> on the table afterwards is recommended. - For a virtual generated column, <command>ANALYZE</command> - is not necessary because such columns never have statistics. </para> </listitem> </varlistentry> diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 40d66537ad7..438f9595e8e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -41,6 +41,7 @@ #include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "pgstat.h" +#include "rewrite/rewriteHandler.h" #include "statistics/extended_stats_internal.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" @@ -559,13 +560,28 @@ do_analyze_rel(Relation onerel, const VacuumParams params, { VacAttrStats *stats = vacattrstats[i]; AttributeOpts *aopt; + Form_pg_attribute attr = TupleDescAttr(onerel->rd_att, stats->tupattnum - 1); - stats->rows = rows; - stats->tupDesc = onerel->rd_att; - stats->compute_stats(stats, - std_fetch_func, - numrows, - totalrows); + /* + * For a virtual generated column, compute statistics for the expression value. + */ + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + AnlExprData *exprdata = (AnlExprData *) palloc0(sizeof(AnlExprData)); + + exprdata->expr = build_generation_expression(onerel, stats->tupattnum); + exprdata->vacattrstat = stats; + compute_expr_stats(onerel, exprdata, 1, rows, numrows); + } + else + { + stats->rows = rows; + stats->tupDesc = onerel->rd_att; + stats->compute_stats(stats, + std_fetch_func, + numrows, + totalrows); + } /* * If the appropriate flavor of the n_distinct option is @@ -1049,10 +1065,6 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) if (attr->attisdropped) return NULL; - /* Don't analyze virtual generated columns */ - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - return NULL; - /* * Get attstattarget value. Set to -1 if null. (Analyze functions expect * -1 to mean use default_statistics_target; see for example diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index c6a58afc5e5..68f7349ffc8 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -42,6 +42,7 @@ #include "parser/parse_relation.h" #include "parser/parsetree.h" #include "partitioning/partdesc.h" +#include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" @@ -77,6 +78,7 @@ static List *get_relation_constraints(PlannerInfo *root, bool include_partition); static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); +static List *get_relation_virtual_gencols(RelOptInfo *rel, Relation relation); static List *get_relation_statistics(RelOptInfo *rel, Relation relation); static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation); @@ -508,6 +510,9 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, rel->indexlist = indexinfos; + /* Make list of virtual generated columns */ + rel->virtual_gencols = get_relation_virtual_gencols(rel, relation); + rel->statlist = get_relation_statistics(rel, relation); /* Grab foreign-table info using the relcache, while we have it */ @@ -1482,6 +1487,61 @@ get_relation_constraints(PlannerInfo *root, return result; } +/* + * get_relation_virtual_gencols + * Retrieve virtual generated columns defined on the table. + * + * Returns a List (possibly empty) of VirtualGeneratedColumnInfoInfo objects + * containing the generation expressions. Each one has been processed by + * eval_const_expressions(), and its Vars are changed to have the varno + * indicated by rel->relid. This allows the expressions to be easily + * compared to expressions taken from WHERE. + */ +static List *get_relation_virtual_gencols(RelOptInfo *rel, Relation relation) +{ + TupleDesc tupdesc = RelationGetDescr(relation); + Index varno = rel->relid; + List *virtual_gencols = NIL; + + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + VirtualGeneratedColumnInfo *info; + + info = makeNode(VirtualGeneratedColumnInfo); + info->attno = attr->attnum; + info->expr = build_generation_expression(relation, attr->attnum); + + /* + * Run the expressions through eval_const_expressions. This is + * not just an optimization, but is necessary, because the + * planner will be comparing them to similarly-processed qual + * clauses, and may fail to detect valid matches without this. + * We must not use canonicalize_qual, however, since these + * aren't qual expressions. + */ + info->expr = eval_const_expressions(NULL, info->expr); + + /* May as well fix opfuncids too */ + fix_opfuncids(info->expr); + + /* Fix Vars to have the desired varno */ + if (varno != 1) + ChangeVarNodes((Node *) info->expr, 1, varno, 0); + + virtual_gencols = lappend(virtual_gencols, info); + } + } + } + + return virtual_gencols; +} + /* * Try loading data for the statistics object. * diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index a8b63ec0884..e26352cacec 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -82,15 +82,6 @@ static void statext_store(Oid statOid, bool inh, static int statext_compute_stattarget(int stattarget, int nattrs, VacAttrStats **stats); -/* Information needed to analyze a single simple expression. */ -typedef struct AnlExprData -{ - Node *expr; /* expression to analyze */ - VacAttrStats *vacattrstat; /* statistics attrs to analyze */ -} AnlExprData; - -static void compute_expr_stats(Relation onerel, AnlExprData *exprdata, - int nexprs, HeapTuple *rows, int numrows); static Datum serialize_expr_stats(AnlExprData *exprdata, int nexprs); static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static AnlExprData *build_expr_data(List *exprs, int stattarget); @@ -2103,7 +2094,7 @@ examine_opclause_args(List *args, Node **exprp, Const **cstp, /* * Compute statistics about expressions of a relation. */ -static void +void compute_expr_stats(Relation onerel, AnlExprData *exprdata, int nexprs, HeapTuple *rows, int numrows) { diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 17fbfa9b410..00e3972ff4f 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5408,6 +5408,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, */ ListCell *ilist; ListCell *slist; + ListCell *vlist; Oid userid; /* @@ -5576,6 +5577,46 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, break; } + /* + * Search virtual generated columns for one with a matching expression + * and use the statistics collected for it if we have. + */ + foreach(vlist, onerel->virtual_gencols) + { + VirtualGeneratedColumnInfo *info = (VirtualGeneratedColumnInfo *) lfirst(vlist); + Node *expr = info->expr; + + /* + * Stop once we've found statistics for the expression (either + * for a virtual generated columns or an index in the preceding + * loop). + */ + if (vardata->statsTuple) + break; + + /* strip RelabelType before comparing it */ + if (expr && IsA(expr, RelabelType)) + expr = (Node *) ((RelabelType *) expr)->arg; + + if (equal(node, expr)) + { + Var *var = makeVar(onerel->relid, + info->attno, + vardata->atttype, + vardata->atttypmod, + exprCollation(node), + 0); + /* + * There cannot be a unique constraint on a virtual generated column. + * Other fields other than the stats tuple must be already set. + */ + vardata->isunique = false; + + /* Try to locate some stats */ + examine_simple_variable(root, var, vardata); + } + } + /* * Search extended statistics for one with a matching expression. * There might be multiple ones, so just grab the first one. In the @@ -5591,7 +5632,8 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, /* * Stop once we've found statistics for the expression (either - * from extended stats, or for an index in the preceding loop). + * from extended stats, or for an index or a virtual generated + * column in the preceding loop). */ if (vardata->statsTuple) break; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index ad2726f026f..433b7cc6d50 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -970,6 +970,8 @@ typedef struct RelOptInfo List *indexlist; /* list of StatisticExtInfo */ List *statlist; + /* list of VirtualGeneratedColumnInfo */ + List *virtual_gencols; /* size estimates derived from pg_class */ BlockNumber pages; Cardinality tuples; @@ -1322,6 +1324,23 @@ typedef struct StatisticExtInfo List *exprs; } StatisticExtInfo; +/* + * VirtualGeneratedColumnInfo + * Information about virtual generated columns for planning/optimization + */ +typedef struct VirtualGeneratedColumnInfo +{ + pg_node_attr(no_copy_equal, no_read, no_query_jumble) + + NodeTag type; + + /* attribute number of virtual generated column */ + AttrNumber attno; + + /* generation expression */ + Node *expr; +} VirtualGeneratedColumnInfo; + /* * JoinDomains * diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index efcb7dc3546..8eaea3b7566 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -68,6 +68,12 @@ typedef struct StatsBuildData bool **nulls; } StatsBuildData; +/* Information needed to analyze a single simple expression. */ +typedef struct AnlExprData +{ + Node *expr; /* expression to analyze */ + VacAttrStats *vacattrstat; /* statistics attrs to analyze */ +} AnlExprData; extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data); extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); @@ -127,4 +133,7 @@ extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root, Selectivity *overlap_basesel, Selectivity *totalsel); +extern void +compute_expr_stats(Relation onerel, AnlExprData *exprdata, int nexprs, + HeapTuple *rows, int numrows); #endif /* EXTENDED_STATS_INTERNAL_H */ diff --git a/src/test/regress/expected/generated_virtual.out b/src/test/regress/expected/generated_virtual.out index aca6347babe..4469206dab0 100644 --- a/src/test/regress/expected/generated_virtual.out +++ b/src/test/regress/expected/generated_virtual.out @@ -1493,6 +1493,13 @@ create table gtest32 ( ); insert into gtest32 values (1), (2); analyze gtest32; +-- Ensure that statistics on virtual generated column are available +select count(*) from pg_stats where tablename = 'gtest32'; + count +------- + 5 +(1 row) + -- Ensure that nullingrel bits are propagated into the generation expressions explain (costs off) select sum(t2.b) over (partition by t2.a), diff --git a/src/test/regress/sql/generated_virtual.sql b/src/test/regress/sql/generated_virtual.sql index ba19bc4c701..523ab188dac 100644 --- a/src/test/regress/sql/generated_virtual.sql +++ b/src/test/regress/sql/generated_virtual.sql @@ -817,6 +817,9 @@ create table gtest32 ( insert into gtest32 values (1), (2); analyze gtest32; +-- Ensure that statistics on virtual generated column are available +select count(*) from pg_stats where tablename = 'gtest32'; + -- Ensure that nullingrel bits are propagated into the generation expressions explain (costs off) select sum(t2.b) over (partition by t2.a), -- 2.43.0