While hacking on Zedstore, we needed to get more accurate statistics for
zedstore and we
faced some restrictions:
1) acquire_sample_rows() always uses RelationGetNumberOfBlocks to generate
sampling block
    numbers; this is not friendly for zedstore, which wants to use a logical
block number, and might also
    not be friendly to non-block-oriented Table AMs.
2) columns of a zedstore table are stored separately, so the columns in a row
have different physical positions;
    the tid in a tuple is invalid for zedstore, which means the correlation
statistic is incorrect for zedstore.
3) RelOptInfo->pages is not correct for Zedstore if we only access a subset
of the columns, which makes
   the IO cost much higher than the actual cost.

For 1) and 2), we propose to extend the existing ANALYZE-scan table AM routines
in patch
"0001-ANALYZE-tableam-API-change.patch", which adds three more APIs:
scan_analyze_beginscan(), scan_analyze_sample_tuple(),
scan_analyze_endscan(). This provides
more convenience, and table AMs can take more control of every step of
sampling rows. Meanwhile,
with the new structure named "AcquireSampleContext", we can acquire extra
info (e.g. physical position,
physical size) in addition to the real column values.

For 3), we hope we can have a mechanism similar to RelOptInfo->rows, which
is calculated from
 (RelOptInfo->tuples * Selectivity); we can calculate RelOptInfo->pages
with a page selectivity which
is based on the selected zedstore columns.
0002-Planner-can-estimate-the-pages-based-on-the-columns-.patch
shows one idea: adding `stadiskfrac` to pg_statistic so the planner can
use it to estimate
RelOptInfo->pages.

0003-ZedStore-use-extended-ANAlYZE-API.patch is attached only to show how
Zedstore uses the
previous patches to achieve the following:
1. use a logical block id to acquire the sample rows.
2. acquire sample rows only from a specified column c1; this is used when the
user only analyzes a table
    on specified columns, e.g.: "analyze zs (c1)".
3. when running ANALYZE, the zedstore table AM provides extra disk-size info, then
ANALYZE computes the
    physical fraction statistic of each column and the planner uses it to
estimate the IO cost based on
    the selected columns.

Thanks,
Pengzhou
From 8a8f6d14d1a1ddc0be35582d4a17af50ffce986a Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:42:37 -0500
Subject: [PATCH 1/3] ANALYZE tableam API change

Extended three ANALYZE-related tableam APIs so AMs can take more control
of the ANALYZE process:
- scan_analyze_beginscan() : so AMs can have a more flexible sampling strategy
- scan_analyze_sample_tuple() : so ANALYZE can get extra info as needed
- scan_analyze_endscan() : so AMs can clean up the sampling scan themselves

Also use the struct AcquireSampleContext for convenience; with it,
tableam analyze routines can provide extra info in addition to the real data,
for example: physical size or compression ratio.
---
 contrib/file_fdw/file_fdw.c              |  36 +++-----
 contrib/postgres_fdw/postgres_fdw.c      |  22 ++---
 src/backend/access/heap/heapam_handler.c |  98 ++++++++++++++++++---
 src/backend/access/table/tableam.c       | 109 +++++++++++++++++++++++
 src/backend/commands/analyze.c           | 144 +++++++++++--------------------
 src/include/access/tableam.h             | 115 ++++++++++++++++++++----
 src/include/foreign/fdwapi.h             |   5 +-
 7 files changed, 367 insertions(+), 162 deletions(-)

diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c
index 549821c..e8937e8 100644
--- a/contrib/file_fdw/file_fdw.c
+++ b/contrib/file_fdw/file_fdw.c
@@ -19,6 +19,7 @@
 #include "access/reloptions.h"
 #include "access/sysattr.h"
 #include "access/table.h"
+#include "access/tableam.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_foreign_table.h"
 #include "commands/copy.h"
@@ -158,9 +159,7 @@ static void estimate_costs(PlannerInfo *root, RelOptInfo *baserel,
 						   FileFdwPlanState *fdw_private,
 						   Cost *startup_cost, Cost *total_cost);
 static int	file_acquire_sample_rows(Relation onerel, int elevel,
-									 HeapTuple *rows, int targrows,
-									 double *totalrows, double *totaldeadrows);
-
+									 AcquireSampleContext *context);
 
 /*
  * Foreign-data wrapper handler function: return a struct with pointers
@@ -1093,30 +1092,27 @@ estimate_costs(PlannerInfo *root, RelOptInfo *baserel,
  */
 static int
 file_acquire_sample_rows(Relation onerel, int elevel,
-						 HeapTuple *rows, int targrows,
-						 double *totalrows, double *totaldeadrows)
+						 AcquireSampleContext *context)
 {
 	int			numrows = 0;
 	double		rowstoskip = -1;	/* -1 means not set yet */
 	ReservoirStateData rstate;
-	TupleDesc	tupDesc;
-	Datum	   *values;
-	bool	   *nulls;
 	bool		found;
 	char	   *filename;
 	bool		is_program;
 	List	   *options;
 	CopyState	cstate;
+	TupleTableSlot *slot;
 	ErrorContextCallback errcallback;
 	MemoryContext oldcontext = CurrentMemoryContext;
 	MemoryContext tupcontext;
+	int	targrows = !context->inh ? context->targrows: context->childtargrows;
 
 	Assert(onerel);
 	Assert(targrows > 0);
 
-	tupDesc = RelationGetDescr(onerel);
-	values = (Datum *) palloc(tupDesc->natts * sizeof(Datum));
-	nulls = (bool *) palloc(tupDesc->natts * sizeof(bool));
+	InitSampleKind(context, onerel, SAMPLE_KIND_DATA);
+	slot = context->k_slots[SAMPLE_KIND_DATA];
 
 	/* Fetch options of foreign table */
 	fileGetOptions(RelationGetRelid(onerel), &filename, &is_program, &options);
@@ -1144,8 +1140,6 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 	errcallback.previous = error_context_stack;
 	error_context_stack = &errcallback;
 
-	*totalrows = 0;
-	*totaldeadrows = 0;
 	for (;;)
 	{
 		/* Check for user-requested abort or sleep */
@@ -1155,7 +1149,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 		MemoryContextReset(tupcontext);
 		MemoryContextSwitchTo(tupcontext);
 
-		found = NextCopyFrom(cstate, NULL, values, nulls);
+		found = NextCopyFrom(cstate, NULL, slot->tts_values, slot->tts_isnull);
 
 		MemoryContextSwitchTo(oldcontext);
 
@@ -1170,7 +1164,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 		 */
 		if (numrows < targrows)
 		{
-			rows[numrows++] = heap_form_tuple(tupDesc, values, nulls);
+			RecordSampleKindRow(context, SAMPLE_KIND_DATA, numrows++, false, false);
 		}
 		else
 		{
@@ -1180,7 +1174,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 			 * not-yet-incremented value of totalrows as t.
 			 */
 			if (rowstoskip < 0)
-				rowstoskip = reservoir_get_next_S(&rstate, *totalrows, targrows);
+				rowstoskip = reservoir_get_next_S(&rstate, context->totalrows, targrows);
 
 			if (rowstoskip <= 0)
 			{
@@ -1191,14 +1185,13 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 				int			k = (int) (targrows * sampler_random_fract(rstate.randstate));
 
 				Assert(k >= 0 && k < targrows);
-				heap_freetuple(rows[k]);
-				rows[k] = heap_form_tuple(tupDesc, values, nulls);
+				RecordSampleKindRow(context, SAMPLE_KIND_DATA, k, true, false);
 			}
 
 			rowstoskip -= 1;
 		}
 
-		*totalrows += 1;
+		context->totalrows += 1;
 	}
 
 	/* Remove error callback. */
@@ -1209,9 +1202,6 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 
 	EndCopyFrom(cstate);
 
-	pfree(values);
-	pfree(nulls);
-
 	/*
 	 * Emit some interesting relation info
 	 */
@@ -1219,7 +1209,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 			(errmsg("\"%s\": file contains %.0f rows; "
 					"%d rows in sample",
 					RelationGetRelationName(onerel),
-					*totalrows, numrows)));
+					context->totalrows, numrows)));
 
 	return numrows;
 }
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 3236664..3a4afac 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -15,6 +15,7 @@
 #include "access/htup_details.h"
 #include "access/sysattr.h"
 #include "access/table.h"
+#include "access/tableam.h"
 #include "catalog/pg_class.h"
 #include "commands/defrem.h"
 #include "commands/explain.h"
@@ -462,9 +463,7 @@ static void process_query_params(ExprContext *econtext,
 								 List *param_exprs,
 								 const char **param_values);
 static int	postgresAcquireSampleRowsFunc(Relation relation, int elevel,
-										  HeapTuple *rows, int targrows,
-										  double *totalrows,
-										  double *totaldeadrows);
+										  AcquireSampleContext *context);
 static void analyze_row_processor(PGresult *res, int row,
 								  PgFdwAnalyzeState *astate);
 static HeapTuple make_tuple_from_result_row(PGresult *res,
@@ -4439,9 +4438,7 @@ postgresAnalyzeForeignTable(Relation relation,
  */
 static int
 postgresAcquireSampleRowsFunc(Relation relation, int elevel,
-							  HeapTuple *rows, int targrows,
-							  double *totalrows,
-							  double *totaldeadrows)
+							  AcquireSampleContext *context)
 {
 	PgFdwAnalyzeState astate;
 	ForeignTable *table;
@@ -4452,16 +4449,19 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 	StringInfoData sql;
 	PGresult   *volatile res = NULL;
 
+	InitSampleKind(context, relation, SAMPLE_KIND_DATA);
+
 	/* Initialize workspace state */
 	astate.rel = relation;
 	astate.attinmeta = TupleDescGetAttInMetadata(RelationGetDescr(relation));
 
-	astate.rows = rows;
-	astate.targrows = targrows;
+	astate.rows = !context->inh ? context->k_rows[SAMPLE_KIND_DATA] :
+					context->k_rows[SAMPLE_KIND_DATA] + context->nchildrows;
+	astate.targrows = !context->inh ? context->targrows : context->childtargrows;
 	astate.numrows = 0;
 	astate.samplerows = 0;
 	astate.rowstoskip = -1;		/* -1 means not set yet */
-	reservoir_init_selection_state(&astate.rstate, targrows);
+	reservoir_init_selection_state(&astate.rstate, astate.targrows);
 
 	/* Remember ANALYZE context, and create a per-tuple temp context */
 	astate.anl_cxt = CurrentMemoryContext;
@@ -4577,10 +4577,10 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 	ReleaseConnection(conn);
 
 	/* We assume that we have no dead tuple. */
-	*totaldeadrows = 0.0;
+	context->totaldeadrows = 0.0;
 
 	/* We've retrieved all living tuples from foreign server. */
-	*totalrows = astate.samplerows;
+	context->totalrows += astate.samplerows;
 
 	/*
 	 * Emit some interesting relation info
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 43cce6f..a777c98 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -36,6 +36,7 @@
 #include "commands/progress.h"
 #include "executor/executor.h"
 #include "pgstat.h"
+#include "parser/analyze.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/bufmgr.h"
@@ -46,7 +47,7 @@
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
-
+static int	compare_rows(const void *a, const void *b);
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
 									 Datum *values, bool *isnull, RewriteState rwstate);
@@ -977,10 +978,23 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	pfree(isnull);
 }
 
+static void
+heapam_scan_analyze_beginscan(Relation onerel, AcquireSampleContext *context)
+{
+	context->scan = table_beginscan_analyze(onerel);
+
+	/* initialize the slot to fetch sample rows */
+	InitSampleKind(context, onerel, SAMPLE_KIND_DATA);
+
+	/* initialize the totalblocks analyze can scan */
+	context->totalblocks = RelationGetNumberOfBlocks(onerel);
+}
+
 static bool
-heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
-							   BufferAccessStrategy bstrategy)
+heapam_scan_analyze_next_block(BlockNumber blockno,
+							   AcquireSampleContext *context)
 {
+	TableScanDesc scan = context->scan;
 	HeapScanDesc hscan = (HeapScanDesc) scan;
 
 	/*
@@ -995,7 +1009,7 @@ heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
 	hscan->rs_cblock = blockno;
 	hscan->rs_cindex = FirstOffsetNumber;
 	hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
-										blockno, RBM_NORMAL, bstrategy);
+										blockno, RBM_NORMAL, context->bstrategy);
 	LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
 
 	/* in heap all blocks can contain tuples, so always return true */
@@ -1003,14 +1017,14 @@ heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
 }
 
 static bool
-heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
-							   double *liverows, double *deadrows,
-							   TupleTableSlot *slot)
+heapam_scan_analyze_next_tuple(TransactionId OldestXmin, AcquireSampleContext *context)
 {
+	TableScanDesc scan = context->scan;
 	HeapScanDesc hscan = (HeapScanDesc) scan;
 	Page		targpage;
 	OffsetNumber maxoffset;
 	BufferHeapTupleTableSlot *hslot;
+	TupleTableSlot *slot = context->k_slots[SAMPLE_KIND_DATA];
 
 	Assert(TTS_IS_BUFFERTUPLE(slot));
 
@@ -1036,7 +1050,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 		if (!ItemIdIsNormal(itemid))
 		{
 			if (ItemIdIsDead(itemid))
-				*deadrows += 1;
+				context->deadrows += 1;
 			continue;
 		}
 
@@ -1051,13 +1065,13 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 		{
 			case HEAPTUPLE_LIVE:
 				sample_it = true;
-				*liverows += 1;
+				context->liverows += 1;
 				break;
 
 			case HEAPTUPLE_DEAD:
 			case HEAPTUPLE_RECENTLY_DEAD:
 				/* Count dead and recently-dead rows */
-				*deadrows += 1;
+				context->deadrows += 1;
 				break;
 
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1083,7 +1097,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
 				{
 					sample_it = true;
-					*liverows += 1;
+					context->liverows += 1;
 				}
 				break;
 
@@ -1112,11 +1126,11 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 				 * concurrent transaction never commits.
 				 */
 				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
-					*deadrows += 1;
+					context->deadrows += 1;
 				else
 				{
 					sample_it = true;
-					*liverows += 1;
+					context->liverows += 1;
 				}
 				break;
 
@@ -1145,6 +1159,61 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 	return false;
 }
 
+static void 
+heapam_scan_analyze_sample_tuple(Index sample, bool replace, AcquireSampleContext *context)
+{
+	/* increase sample row count */
+	context->samplerows++;
+
+	RecordSampleKindRow(context, SAMPLE_KIND_DATA, sample, replace, true);
+}
+
+static void
+heapam_scan_analyze_endscan(AcquireSampleContext *context)
+{
+	int	targrows = !context->inh ?
+					context->targrows : context->childtargrows;
+	HeapTuple *rows = !context->inh ?
+					context->k_rows[SAMPLE_KIND_DATA] :
+					context->k_rows[SAMPLE_KIND_DATA] + context->nchildrows;
+	/*
+	 * If we didn't find as many tuples as we wanted then we're done. No sort
+	 * is needed, since they're already in order.
+	 *
+	 * Otherwise we need to sort the collected tuples by position
+	 * (itempointer). It's not worth worrying about corner cases where the
+	 * tuples are already sorted.
+	 */
+	if (context->samplerows > targrows)
+		qsort((void *)rows, targrows, sizeof(HeapTuple), compare_rows);
+
+	table_endscan(context->scan);
+}
+
+/*
+ * qsort comparator for sorting rows[] array
+ */
+static int
+compare_rows(const void *a, const void *b)
+{
+	HeapTuple	ha = *(const HeapTuple *) a;
+	HeapTuple	hb = *(const HeapTuple *) b;
+	BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
+	OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
+	BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
+	OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
+
+	if (ba < bb)
+		return -1;
+	if (ba > bb)
+		return 1;
+	if (oa < ob)
+		return -1;
+	if (oa > ob)
+		return 1;
+	return 0;
+}
+
 static double
 heapam_index_build_range_scan(Relation heapRelation,
 							  Relation indexRelation,
@@ -2533,8 +2602,11 @@ static const TableAmRoutine heapam_methods = {
 	.relation_copy_data = heapam_relation_copy_data,
 	.relation_copy_for_cluster = heapam_relation_copy_for_cluster,
 	.relation_vacuum = heap_vacuum_rel,
+	.scan_analyze_beginscan = heapam_scan_analyze_beginscan,
 	.scan_analyze_next_block = heapam_scan_analyze_next_block,
 	.scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
+	.scan_analyze_sample_tuple = heapam_scan_analyze_sample_tuple,
+	.scan_analyze_endscan = heapam_scan_analyze_endscan,
 	.index_build_range_scan = heapam_index_build_range_scan,
 	.index_validate_scan = heapam_index_validate_scan,
 
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index b9ed336..1d51711 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -24,6 +24,7 @@
 #include "access/heapam.h"		/* for ss_* */
 #include "access/tableam.h"
 #include "access/xact.h"
+#include "catalog/pg_type.h"
 #include "optimizer/plancat.h"
 #include "storage/bufmgr.h"
 #include "storage/shmem.h"
@@ -650,3 +651,111 @@ table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
 	else
 		*allvisfrac = (double) relallvisible / curpages;
 }
+
+/* Create the context to acquire sample rows */
+AcquireSampleContext *
+CreateAcquireSampleContext(Relation onerel,
+						   List *cols,
+						   int targrows,
+						   bool inh,
+						   BufferAccessStrategy strategy)
+{
+	AcquireSampleContext *context;
+	
+	context = (AcquireSampleContext *) palloc(sizeof(AcquireSampleContext));
+	context->cols = cols;
+	context->targrows = targrows;
+	context->bstrategy = strategy;
+
+	/* statistic */
+	context->totalrows = 0;
+	context->totaldeadrows = 0;
+	context->liverows = 0;
+	context->deadrows = 0;
+	context->samplerows = 0;
+
+	/* field for inherit table */
+	context->inh = inh;
+	context->childtargrows = 0;
+	context->nchildrows = 0;
+
+	/* empty all sample type */
+	memset(context->k_slots, 0, NUM_SAMPLEKIND * sizeof(TupleTableSlot *));
+	memset(context->k_rows, 0, NUM_SAMPLEKIND * sizeof(HeapTuple *));
+
+	return context;
+}
+
+void
+DestroyAcquireSampleContext(AcquireSampleContext *context)
+{
+	for (int i = 0; i < NUM_SAMPLEKIND; i++)
+	{
+		TupleTableSlot *slot = context->k_slots[i];
+		if (slot)
+			ExecDropSingleTupleTableSlot(slot);
+	}
+}
+
+void
+InitSampleKind(AcquireSampleContext *context,
+			   Relation onerel,
+			   AcquireSampleKind kind)
+{
+	TupleDesc tupdesc;
+	int attr_cnt = onerel->rd_att->natts;
+
+	/*
+	 * We already initialized it; this happens in
+	 * acquire_inherited_sample_rows(), which uses
+	 * the same context to acquire samples from all
+	 * inherited tables.
+	 */
+	if (context->k_slots[kind])
+	{
+		Assert(context->inh);
+		context->liverows = 0;
+		context->deadrows = 0;
+		context->samplerows = 0;
+		return;
+	}
+
+	switch (kind)
+	{
+		case SAMPLE_KIND_DATA:
+			tupdesc = RelationGetDescr(onerel);
+			break;
+		case SAMPLE_KIND_DISKSIZE:
+			tupdesc = CreateTemplateTupleDesc(attr_cnt);
+			for (int i = 1; i <= attr_cnt; i++)
+				TupleDescInitEntry(tupdesc, i, "", FLOAT8OID, -1, 0);
+			break;
+		default:
+			elog(ERROR, "unknow sampling type");
+	}
+
+	context->k_slots[kind] =
+		MakeSingleTupleTableSlot(tupdesc, table_slot_callbacks(onerel));
+	context->k_rows[kind] =
+		(HeapTuple *) palloc(context->targrows * sizeof(HeapTuple));
+}
+
+void
+RecordSampleKindRow(AcquireSampleContext *context,
+					AcquireSampleKind kind,
+					Index sample,
+					bool replace,
+					bool withtid)
+{
+	TupleTableSlot *slot = context->k_slots[kind];
+	HeapTuple *rows = !context->inh ? context->k_rows[kind] :
+		context->k_rows[kind] + context->nchildrows;
+
+	if (replace)
+		heap_freetuple(rows[sample]);
+
+	rows[sample] = ExecCopySlotHeapTuple(slot);
+
+	if (withtid)
+		rows[sample]->t_self = slot->tts_tid;
+}
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 13b93f2..a97297b 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -83,7 +83,6 @@ int			default_statistics_target = 100;
 static MemoryContext anl_context = NULL;
 static BufferAccessStrategy vac_strategy;
 
-
 static void do_analyze_rel(Relation onerel,
 						   VacuumParams *params, List *va_cols,
 						   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
@@ -95,12 +94,9 @@ static void compute_index_stats(Relation onerel, double totalrows,
 static VacAttrStats *examine_attribute(Relation onerel, int attnum,
 									   Node *index_expr);
 static int	acquire_sample_rows(Relation onerel, int elevel,
-								HeapTuple *rows, int targrows,
-								double *totalrows, double *totaldeadrows);
-static int	compare_rows(const void *a, const void *b);
+								AcquireSampleContext *context);
 static int	acquire_inherited_sample_rows(Relation onerel, int elevel,
-										  HeapTuple *rows, int targrows,
-										  double *totalrows, double *totaldeadrows);
+										  AcquireSampleContext *context);
 static void update_attstats(Oid relid, bool inh,
 							int natts, VacAttrStats **vacattrstats);
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
@@ -318,6 +314,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	Oid			save_userid;
 	int			save_sec_context;
 	int			save_nestlevel;
+	AcquireSampleContext *sample_context;
 
 	if (inh)
 		ereport(elevel,
@@ -502,18 +499,21 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	if (targrows < minrows)
 		targrows = minrows;
 
+	/* create context for acquiring sample rows */
+	sample_context = CreateAcquireSampleContext(onerel, va_cols, targrows,
+												inh, vac_strategy);
+
 	/*
 	 * Acquire the sample rows
 	 */
-	rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple));
 	if (inh)
-		numrows = acquire_inherited_sample_rows(onerel, elevel,
-												rows, targrows,
-												&totalrows, &totaldeadrows);
+		numrows = acquire_inherited_sample_rows(onerel, elevel, sample_context);
 	else
-		numrows = (*acquirefunc) (onerel, elevel,
-								  rows, targrows,
-								  &totalrows, &totaldeadrows);
+		numrows = (*acquirefunc) (onerel, elevel, sample_context); 
+
+	rows = sample_context->k_rows[SAMPLE_KIND_DATA];
+	totalrows = sample_context->totalrows;
+	totaldeadrows = sample_context->totaldeadrows;
 
 	/*
 	 * Compute the statistics.  Temporary results during the calculations for
@@ -592,7 +592,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 		 * not for relations representing inheritance trees.
 		 */
 		if (!inh)
-			BuildRelationExtStatistics(onerel, totalrows, numrows, rows,
+			BuildRelationExtStatistics(onerel, totalrows, numrows,
+									   rows,
 									   attr_cnt, vacattrstats);
 	}
 
@@ -690,6 +691,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 							pg_rusage_show(&ru0))));
 	}
 
+	DestroyAcquireSampleContext(sample_context);
+
 	/* Roll back any GUC changes executed by index functions */
 	AtEOXact_GUC(false, save_nestlevel);
 
@@ -1020,24 +1023,22 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
  */
 static int
 acquire_sample_rows(Relation onerel, int elevel,
-					HeapTuple *rows, int targrows,
-					double *totalrows, double *totaldeadrows)
+					AcquireSampleContext *context)
 {
 	int			numrows = 0;	/* # rows now in reservoir */
 	double		samplerows = 0; /* total # rows collected */
-	double		liverows = 0;	/* # live rows seen */
-	double		deadrows = 0;	/* # dead rows seen */
 	double		rowstoskip = -1;	/* -1 means not set yet */
 	BlockNumber totalblocks;
 	TransactionId OldestXmin;
 	BlockSamplerData bs;
 	ReservoirStateData rstate;
-	TupleTableSlot *slot;
-	TableScanDesc scan;
+	int	targrows = !context->inh ? context->targrows : context->childtargrows;
 
 	Assert(targrows > 0);
 
-	totalblocks = RelationGetNumberOfBlocks(onerel);
+	table_scan_analyze_beginscan(onerel, context);
+
+	totalblocks = context->totalblocks;
 
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
 	OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM);
@@ -1047,9 +1048,6 @@ acquire_sample_rows(Relation onerel, int elevel,
 	/* Prepare for sampling rows */
 	reservoir_init_selection_state(&rstate, targrows);
 
-	scan = table_beginscan_analyze(onerel);
-	slot = table_slot_create(onerel, NULL);
-
 	/* Outer loop over blocks to sample */
 	while (BlockSampler_HasMore(&bs))
 	{
@@ -1057,10 +1055,10 @@ acquire_sample_rows(Relation onerel, int elevel,
 
 		vacuum_delay_point();
 
-		if (!table_scan_analyze_next_block(scan, targblock, vac_strategy))
+		if (!table_scan_analyze_next_block(targblock, context))
 			continue;
 
-		while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot))
+		while (table_scan_analyze_next_tuple(OldestXmin, context))
 		{
 			/*
 			 * The first targrows sample rows are simply copied into the
@@ -1076,8 +1074,8 @@ acquire_sample_rows(Relation onerel, int elevel,
 			 */
 			if (numrows < targrows)
 			{
-				rows[numrows] = ExecCopySlotHeapTuple(slot);
-				rows[numrows]->t_self = slot->tts_tid;
+				table_scan_analyze_sample_tuple(numrows, false, context);
+
 				numrows++;
 			}
 			else
@@ -1099,9 +1097,8 @@ acquire_sample_rows(Relation onerel, int elevel,
 					int			k = (int) (targrows * sampler_random_fract(rstate.randstate));
 
 					Assert(k >= 0 && k < targrows);
-					heap_freetuple(rows[k]);
-					rows[k] = ExecCopySlotHeapTuple(slot);
-					rows[k]->t_self = slot->tts_tid;
+
+					table_scan_analyze_sample_tuple(k, true, context);
 				}
 
 				rowstoskip -= 1;
@@ -1111,19 +1108,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 		}
 	}
 
-	ExecDropSingleTupleTableSlot(slot);
-	table_endscan(scan);
-
-	/*
-	 * If we didn't find as many tuples as we wanted then we're done. No sort
-	 * is needed, since they're already in order.
-	 *
-	 * Otherwise we need to sort the collected tuples by position
-	 * (itempointer). It's not worth worrying about corner cases where the
-	 * tuples are already sorted.
-	 */
-	if (numrows == targrows)
-		qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
+	table_scan_analyze_endscan(context);
 
 	/*
 	 * Estimate total numbers of live and dead rows in relation, extrapolating
@@ -1134,13 +1119,13 @@ acquire_sample_rows(Relation onerel, int elevel,
 	 */
 	if (bs.m > 0)
 	{
-		*totalrows = floor((liverows / bs.m) * totalblocks + 0.5);
-		*totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
+		context->totalrows += floor((context->liverows / bs.m) * totalblocks + 0.5);
+		context->totaldeadrows += floor((context->deadrows / bs.m) * totalblocks + 0.5);
 	}
 	else
 	{
-		*totalrows = 0.0;
-		*totaldeadrows = 0.0;
+		context->totalrows += 0.0;
+		context->totaldeadrows += 0.0;
 	}
 
 	/*
@@ -1152,36 +1137,14 @@ acquire_sample_rows(Relation onerel, int elevel,
 					"%d rows in sample, %.0f estimated total rows",
 					RelationGetRelationName(onerel),
 					bs.m, totalblocks,
-					liverows, deadrows,
-					numrows, *totalrows)));
+					context->liverows,
+					context->deadrows,
+					numrows,
+					context->totalrows)));
 
 	return numrows;
 }
 
-/*
- * qsort comparator for sorting rows[] array
- */
-static int
-compare_rows(const void *a, const void *b)
-{
-	HeapTuple	ha = *(const HeapTuple *) a;
-	HeapTuple	hb = *(const HeapTuple *) b;
-	BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
-	OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
-	BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
-	OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
-
-	if (ba < bb)
-		return -1;
-	if (ba > bb)
-		return 1;
-	if (oa < ob)
-		return -1;
-	if (oa > ob)
-		return 1;
-	return 0;
-}
-
 
 /*
  * acquire_inherited_sample_rows -- acquire sample rows from inheritance tree
@@ -1193,17 +1156,16 @@ compare_rows(const void *a, const void *b)
  */
 static int
 acquire_inherited_sample_rows(Relation onerel, int elevel,
-							  HeapTuple *rows, int targrows,
-							  double *totalrows, double *totaldeadrows)
+							  AcquireSampleContext *context)
 {
 	List	   *tableOIDs;
 	Relation   *rels;
 	AcquireSampleRowsFunc *acquirefuncs;
 	double	   *relblocks;
 	double		totalblocks;
-	int			numrows,
-				nrels,
+	int			nrels,
 				i;
+	int			targrows = context->targrows;
 	ListCell   *lc;
 	bool		has_child;
 
@@ -1337,9 +1299,6 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 	 * rels have radically different free-space percentages, but it's not
 	 * clear that it's worth working harder.)
 	 */
-	numrows = 0;
-	*totalrows = 0;
-	*totaldeadrows = 0;
 	for (i = 0; i < nrels; i++)
 	{
 		Relation	childrel = rels[i];
@@ -1352,17 +1311,15 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 
 			childtargrows = (int) rint(targrows * childblocks / totalblocks);
 			/* Make sure we don't overrun due to roundoff error */
-			childtargrows = Min(childtargrows, targrows - numrows);
+			childtargrows = Min(childtargrows, targrows - context->nchildrows);
 			if (childtargrows > 0)
 			{
 				int			childrows;
-				double		trows,
-							tdrows;
+
+				context->childtargrows = childtargrows;
 
 				/* Fetch a random sample of the child's rows */
-				childrows = (*acquirefunc) (childrel, elevel,
-											rows + numrows, childtargrows,
-											&trows, &tdrows);
+				childrows = (*acquirefunc) (childrel, elevel, context);
 
 				/* We may need to convert from child's rowtype to parent's */
 				if (childrows > 0 &&
@@ -1380,19 +1337,18 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 						for (j = 0; j < childrows; j++)
 						{
 							HeapTuple	newtup;
+							HeapTuple	*rows = context->k_rows[SAMPLE_KIND_DATA];
 
-							newtup = execute_attr_map_tuple(rows[numrows + j], map);
-							heap_freetuple(rows[numrows + j]);
-							rows[numrows + j] = newtup;
+							newtup = execute_attr_map_tuple(rows[context->nchildrows + j], map);
+							heap_freetuple(rows[context->nchildrows + j]);
+							rows[context->nchildrows + j] = newtup;
 						}
 						free_conversion_map(map);
 					}
 				}
 
 				/* And add to counts */
-				numrows += childrows;
-				*totalrows += trows;
-				*totaldeadrows += tdrows;
+				context->nchildrows += childrows;
 			}
 		}
 
@@ -1403,7 +1359,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 		table_close(childrel, NoLock);
 	}
 
-	return numrows;
+	return context->nchildrows;
 }
 
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 0b8dcc6..9421aac 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -38,6 +38,51 @@ struct TBMIterateResult;
 struct VacuumParams;
 struct ValidateIndexState;
 
+typedef enum AcquireSampleKind 
+{
+	SAMPLE_KIND_DATA = 0,	/* real data per column */
+	SAMPLE_KIND_DISKSIZE,	/* physical size per column */
+
+	NUM_SAMPLEKIND			/* must be last */
+} AcquireSampleKind;
+
+typedef struct AcquireSampleContext
+{
+	int targrows;
+	List *cols;
+	BufferAccessStrategy bstrategy;
+
+	/* filled by table AM analyze routines */
+	BlockNumber	totalblocks;
+	TableScanDesc scan;
+
+	double	totalrows;
+	double 	totaldeadrows;
+	double 	liverows;
+	double 	deadrows;
+	int		samplerows; /* number of sample rows include replaced ones */
+
+	/* fields for inherit table */
+	bool	inh;
+	int		childtargrows;
+	int		nchildrows;		/* nrows already acquired */
+
+	/*
+	 * Used by table AM analyze routines to hold
+	 * the temporary tuple for each kind of sample
+	 * row; the tuple is finally copied into the
+	 * corresponding k_rows[] array if it is
+	 * randomly selected.
+	 */
+	TupleTableSlot* k_slots[NUM_SAMPLEKIND];
+
+	/*
+	 * Stores the final sample rows that will be
+	 * used to compute statistics.
+	 */
+	HeapTuple* k_rows[NUM_SAMPLEKIND];
+} AcquireSampleContext;
+
 /*
  * Bitmask values for the flags argument to the scan_begin callback.
  */
@@ -532,9 +577,10 @@ typedef struct TableAmRoutine
 	 * clear what a good interface for non block based AMs would be, so there
 	 * isn't one yet.
 	 */
-	bool		(*scan_analyze_next_block) (TableScanDesc scan,
-											BlockNumber blockno,
-											BufferAccessStrategy bstrategy);
+	void		(*scan_analyze_beginscan) (Relation onerel, AcquireSampleContext *context);
+
+	bool		(*scan_analyze_next_block) (BlockNumber blockno,
+											AcquireSampleContext *context);
 
 	/*
 	 * See table_scan_analyze_next_tuple().
@@ -544,11 +590,13 @@ typedef struct TableAmRoutine
 	 * influence autovacuum scheduling (see comment for relation_vacuum
 	 * callback).
 	 */
-	bool		(*scan_analyze_next_tuple) (TableScanDesc scan,
-											TransactionId OldestXmin,
-											double *liverows,
-											double *deadrows,
-											TupleTableSlot *slot);
+	bool		(*scan_analyze_next_tuple) (TransactionId OldestXmin,
+											AcquireSampleContext *context);
+
+	void		(*scan_analyze_sample_tuple) (Index sample, bool replace,
+											  AcquireSampleContext *context);
+
+	void		(*scan_analyze_endscan) (AcquireSampleContext *context);
 
 	/* see table_index_build_range_scan for reference about parameters */
 	double		(*index_build_range_scan) (Relation table_rel,
@@ -1474,6 +1522,12 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params,
 	rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
 }
 
+static inline void
+table_scan_analyze_beginscan(Relation rel, struct AcquireSampleContext *context)
+{
+	rel->rd_tableam->scan_analyze_beginscan(rel, context);
+}
+
 /*
  * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
  * started with table_beginscan_analyze().  Note that this routine might
@@ -1483,11 +1537,10 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params,
  * Returns false if block is unsuitable for sampling, true otherwise.
  */
 static inline bool
-table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
-							  BufferAccessStrategy bstrategy)
+table_scan_analyze_next_block(BlockNumber blockno,
+							  struct AcquireSampleContext *context)
 {
-	return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
-															bstrategy);
+	return context->scan->rs_rd->rd_tableam->scan_analyze_next_block(blockno, context);
 }
 
 /*
@@ -1501,13 +1554,21 @@ table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
  * tuples.
  */
 static inline bool
-table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
-							  double *liverows, double *deadrows,
-							  TupleTableSlot *slot)
+table_scan_analyze_next_tuple(TransactionId OldestXmin, AcquireSampleContext *context)
+{
+	return context->scan->rs_rd->rd_tableam->scan_analyze_next_tuple(OldestXmin, context);
+}
+
+static inline void 
+table_scan_analyze_sample_tuple(Index sample, bool replace, AcquireSampleContext *context)
+{
+	context->scan->rs_rd->rd_tableam->scan_analyze_sample_tuple(sample, replace, context);
+}
+
+static inline void
+table_scan_analyze_endscan(AcquireSampleContext *context)
 {
-	return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
-															liverows, deadrows,
-															slot);
+	context->scan->rs_rd->rd_tableam->scan_analyze_endscan(context);
 }
 
 /*
@@ -1783,6 +1844,24 @@ extern void table_block_relation_estimate_size(Relation rel,
 											   Size usable_bytes_per_page);
 
 /* ----------------------------------------------------------------------------
+ * Helper functions to implement analyze scan. 
+ * ----------------------------------------------------------------------------
+ */
+extern AcquireSampleContext *
+CreateAcquireSampleContext(Relation onerel,
+						   List *cols,
+						   int targrows,
+						   bool inh,
+						   BufferAccessStrategy strategy);
+extern void DestroyAcquireSampleContext(AcquireSampleContext *context);
+extern void InitSampleKind(AcquireSampleContext *context,
+						   Relation onerel, AcquireSampleKind kind);
+extern void RecordSampleKindRow(AcquireSampleContext *context,
+								AcquireSampleKind kind, Index sample,
+								bool replace, bool withtid);
+
+
+/* ----------------------------------------------------------------------------
  * Functions in tableamapi.c
  * ----------------------------------------------------------------------------
  */
diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h
index 8226860..4d14a81 100644
--- a/src/include/foreign/fdwapi.h
+++ b/src/include/foreign/fdwapi.h
@@ -18,6 +18,7 @@
 
 /* To avoid including explain.h here, reference ExplainState thus: */
 struct ExplainState;
+struct AcquireSampleContext;
 
 
 /*
@@ -140,9 +141,7 @@ typedef void (*ExplainDirectModify_function) (ForeignScanState *node,
 											  struct ExplainState *es);
 
 typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel,
-									  HeapTuple *rows, int targrows,
-									  double *totalrows,
-									  double *totaldeadrows);
+									  struct AcquireSampleContext *context);
 
 typedef bool (*AnalyzeForeignTable_function) (Relation relation,
 											  AcquireSampleRowsFunc *func,
-- 
1.8.3.1

From 1c7daec661f2b6e7ac388e4b74efd119945d6421 Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:43:33 -0500
Subject: [PATCH 2/3] Planner can estimate the pages based on the columns
 selected

Planner used to assume we need to scan all the pages even if we
only need one or two columns in a query. This is right for
heap tables; however, if we are using a column store like
zedstore, we can estimate the number of pages from only the
selected columns, which will reduce the IO cost and the number
of parallel workers in some cases.

To do this, this commit adds a new field `stadiskfrac` to
catalog `pg_statistic`; it records the fraction of the physical
size that a column uses compared to the whole table. The planner
calculates a page selectivity based on the targetlist
and baserestrictinfo, then scales the rel->pages
obtained from estimate_rel_size() by it.
---
 src/backend/commands/analyze.c        | 48 +++++++++++++++++++
 src/backend/optimizer/path/allpaths.c | 87 +++++++++++++++++++++++++++++++++--
 src/include/catalog/catversion.h      |  2 +-
 src/include/catalog/pg_statistic.h    |  3 ++
 src/include/commands/vacuum.h         |  6 +++
 5 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index a97297b..f8fce9c 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -87,6 +87,9 @@ static void do_analyze_rel(Relation onerel,
 						   VacuumParams *params, List *va_cols,
 						   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
 						   bool inh, bool in_outer_xact, int elevel);
+static void compute_disk_stats(VacAttrStats **stats, int natts,
+							   TupleDesc desc, HeapTuple *rows,
+							   int numrows);
 static void compute_index_stats(Relation onerel, double totalrows,
 								AnlIndexData *indexdata, int nindexes,
 								HeapTuple *rows, int numrows,
@@ -560,6 +563,15 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 			MemoryContextResetAndDeleteChildren(col_context);
 		}
 
+		if (!va_cols && sample_context->k_slots[SAMPLE_KIND_DISKSIZE])
+		{
+			TupleDesc tupdesc =
+				sample_context->k_slots[SAMPLE_KIND_DISKSIZE]->tts_tupleDescriptor;
+			HeapTuple *rows = sample_context->k_rows[SAMPLE_KIND_DISKSIZE];
+
+			compute_disk_stats(vacattrstats, attr_cnt, tupdesc, rows, numrows);
+		}
+
 		if (hasindex)
 			compute_index_stats(onerel, totalrows,
 								indexdata, nindexes,
@@ -705,6 +717,41 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	anl_context = NULL;
 }
 
+/*
+ * compute_disk_stats -- derive per-column disk-size fractions.
+ *
+ * Each SAMPLE_KIND_DISKSIZE sample row carries one float8 per column
+ * holding its physical size; stadiskfrac becomes the column's share.
+ */
+static void
+compute_disk_stats(VacAttrStats **stats, int natts,
+				   TupleDesc desc, HeapTuple *rows,
+				   int numrows)
+{
+	int		i, j;
+	float8	total = 0;
+	bool	isNull;
+
+	for (i = 0; i < numrows; i++)
+	{
+		for (j = 0; j < natts; j++)
+		{
+			VacAttrStats *vac = stats[j];
+			/* index by the stats entry's own attnum; stats[] need not be dense */
+			Datum dat = heap_getattr(rows[i], vac->attr->attnum, desc, &isNull);
+			if (!isNull)
+			{
+				vac->disksize += DatumGetFloat8(dat);
+				total += DatumGetFloat8(dat);
+			}
+		}
+	}
+
+	/* guard total == 0 (all sizes NULL or numrows == 0) to avoid NaN */
+	for (j = 0; j < natts; j++)
+		stats[j]->stadiskfrac = (total > 0) ? (stats[j]->disksize / total) : 0;
+}
+
 /*
  * Compute statistics about indexes of a relation
  */
@@ -1425,6 +1472,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
 		values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(stats->attr->attnum);
 		values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inh);
 		values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
+		values[Anum_pg_statistic_stadiskfrac - 1] = Float4GetDatum(stats->stadiskfrac);
 		values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
 		values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
 		i = Anum_pg_statistic_stakind1 - 1;
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index db3a68a..5df1466 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -23,6 +23,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_proc.h"
+#include "catalog/pg_statistic.h"
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
@@ -47,6 +48,7 @@
 #include "partitioning/partbounds.h"
 #include "partitioning/partprune.h"
 #include "rewrite/rewriteManip.h"
+#include "utils/syscache.h"
 #include "utils/lsyscache.h"
 
 
@@ -79,7 +81,11 @@ static void set_rel_size(PlannerInfo *root, RelOptInfo *rel,
 static void set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 							 Index rti, RangeTblEntry *rte);
 static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel,
-							   RangeTblEntry *rte);
+							   Index rti, RangeTblEntry *rte);
+static void set_plain_rel_page_estimates(PlannerInfo *root,
+										 RelOptInfo *rel,
+										 Index rti,
+										 RangeTblEntry *rte);
 static void create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel);
 static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
 									  RangeTblEntry *rte);
@@ -409,7 +415,7 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel,
 				else
 				{
 					/* Plain relation */
-					set_plain_rel_size(root, rel, rte);
+					set_plain_rel_size(root, rel, rti, rte);
 				}
 				break;
 			case RTE_SUBQUERY:
@@ -571,7 +577,7 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
  *	  Set size estimates for a plain relation (no subquery, no inheritance)
  */
 static void
-set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte)
 {
 	/*
 	 * Test any partial indexes of rel for applicability.  We must do this
@@ -581,6 +587,81 @@ set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 
 	/* Mark rel with estimated output rows, width, etc */
 	set_baserel_size_estimates(root, rel);
+
+	/* Estimate the pages based on the selected columns */
+	set_plain_rel_page_estimates(root, rel, rti, rte);
+}
+
+/*
+ * set_plain_rel_page_estimates
+ *		Scale rel->pages by the summed stadiskfrac of the columns the
+ *		query references; heap tables store 0, so pages is untouched.
+ */
+static void
+set_plain_rel_page_estimates(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte)
+{
+	List	   *vars;
+	double		pages;
+	ListCell   *lc;
+	ListCell   *lc1;
+	Bitmapset  *cols = NULL;
+	HeapTuple	tp;
+	AttrNumber	attno;
+	Selectivity sel = 0;
+	Assert(rel->rtekind == RTE_RELATION);
+	foreach(lc, rel->reltarget->exprs)
+	{
+		vars = pull_var_clause((Node *) lfirst(lc),
+							   PVC_RECURSE_AGGREGATES |
+							   PVC_RECURSE_WINDOWFUNCS |
+							   PVC_RECURSE_PLACEHOLDERS);
+		foreach(lc1, vars)
+		{
+			Var *var = (Var *) lfirst(lc1);
+			/* whole-row Var (attno 0) needs every column: don't scale */
+			if (var->varno == rti && var->varattno == 0)
+				return;
+			if (var->varno == rti && var->varattno > 0)
+				cols = bms_add_member(cols, var->varattno);
+		}
+	}
+	foreach(lc, rel->baserestrictinfo)
+	{
+		RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
+		vars = pull_var_clause((Node *) rinfo->clause,
+							   PVC_RECURSE_AGGREGATES |
+							   PVC_RECURSE_WINDOWFUNCS |
+							   PVC_RECURSE_PLACEHOLDERS);
+		foreach(lc1, vars)
+		{
+			Var *var = (Var *) lfirst(lc1);
+			if (var->varno == rti && var->varattno == 0)
+				return;
+			if (var->varno == rti && var->varattno > 0)
+				cols = bms_add_member(cols, var->varattno);
+		}
+	}
+	attno = -1;
+	while ((attno = bms_next_member(cols, attno)) >= 0)
+	{
+		tp = SearchSysCache3(STATRELATTINH,
+							 ObjectIdGetDatum(rte->relid),
+							 Int16GetDatum(attno),
+							 BoolGetDatum(rte->inh));
+		if (HeapTupleIsValid(tp))
+		{
+			sel += ((Form_pg_statistic) GETSTRUCT(tp))->stadiskfrac;
+			ReleaseSysCache(tp);
+		}
+	}
+	/* clamp in case stale stats make the fractions sum past 1.0 */
+	if (sel > 1.0)
+		sel = 1.0;
+	if (sel > 0)
+	{
+		pages = rel->pages * sel;
+		rel->pages = (pages <= 1.0) ? 1 : (BlockNumber) rint(pages);
+	}
+}
 
 /*
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 1f6de76..1c14c6b 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201910251
+#define CATALOG_VERSION_NO	201912041
 
 #endif
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 207be54..66029f6 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -36,6 +36,9 @@ CATALOG(pg_statistic,2619,StatisticRelationId)
 	/* the fraction of the column's entries that are NULL: */
 	float4		stanullfrac;
 
+	/* the fraction of the column's disksize of all columns */
+	float4		stadiskfrac;
+
 	/*
 	 * stawidth is the average width in bytes of non-null entries.  For
 	 * fixed-width datatypes this is of course the same as the typlen, but for
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 128f7ae..077a3c1 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -114,6 +114,12 @@ typedef struct VacAttrStats
 	Datum	   *stavalues[STATISTIC_NUM_SLOTS];
 
 	/*
+	 * These fields are to be filled in compute_disk_stats
+	 */
+	float4		stadiskfrac;	/* fraction of the physical size */
+	float8		disksize;		/* value of the physical size */
+
+	/*
 	 * These fields describe the stavalues[n] element types. They will be
 	 * initialized to match attrtypid, but a custom typanalyze function might
 	 * want to store an array of something other than the analyzed column's
-- 
1.8.3.1

From 59b4aa28e60053e030d740a7df4811b4efa21b98 Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:59:22 -0500
Subject: [PATCH 3/3] ZedStore uses extended ANALYZE API

1) use the logical block ID in ANALYZE
2) provide disksize info per column when running ANALYZE, so
   the planner can estimate the pages it needs to scan based
   on the columns selected
3) analyze only the columns specified
---
 src/backend/access/zedstore/zedstore_attstream.c |   7 +-
 src/backend/access/zedstore/zedstoream_handler.c | 117 ++++++++++++++++++++---
 src/include/access/zedstore_internal.h           |   4 +
 3 files changed, 114 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/zedstore/zedstore_attstream.c b/src/backend/access/zedstore/zedstore_attstream.c
index c0c24e7..0189964 100644
--- a/src/backend/access/zedstore/zedstore_attstream.c
+++ b/src/backend/access/zedstore/zedstore_attstream.c
@@ -166,6 +166,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 					  attstream->t_size - SizeOfZSAttStreamHeader,
 					  attstream->t_decompressed_bufsize);
 		decoder->chunks_len = attstream->t_decompressed_size;
+		decoder->compression_ratio = ((float8) buf_size_needed) / attstream->t_size;
 	}
 	else
 	{
@@ -173,6 +174,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 			   ((char *) attstream) + SizeOfZSAttStreamHeader,
 			   attstream->t_size - SizeOfZSAttStreamHeader);
 		decoder->chunks_len = attstream->t_size - SizeOfZSAttStreamHeader;
+		decoder->compression_ratio = 1.0;
 	}
 	decoder->firsttid = get_chunk_first_tid(decoder->attlen, decoder->chunks_buf);
 	decoder->lasttid = attstream->t_lasttid;
@@ -181,6 +183,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 	decoder->prevtid = 0;
 
 	decoder->num_elements = 0;
+	decoder->avg_elements_size = 0;
 }
 
 /*
@@ -226,6 +229,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 	zstid		lasttid;
 	int			total_decoded;
 	char	   *p;
+	char	   *lastp;
 	char	   *pend;
 	MemoryContext oldcxt;
 
@@ -236,7 +240,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 		MemoryContextSwitchTo(decoder->tmpcxt);
 	}
 
-	p = decoder->chunks_buf + decoder->pos;
+	lastp = p = decoder->chunks_buf + decoder->pos;
 	pend = decoder->chunks_buf + decoder->chunks_len;
 
 	total_decoded = 0;
@@ -261,6 +265,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 
 	Assert(p <= pend);
 	decoder->num_elements = total_decoded;
+	decoder->avg_elements_size = ((p - lastp) / total_decoded) / decoder->compression_ratio;
 	decoder->pos = p - decoder->chunks_buf;
 	if (total_decoded > 0)
 	{
diff --git a/src/backend/access/zedstore/zedstoream_handler.c b/src/backend/access/zedstore/zedstoream_handler.c
index 22f0773..48a479d 100644
--- a/src/backend/access/zedstore/zedstoream_handler.c
+++ b/src/backend/access/zedstore/zedstoream_handler.c
@@ -35,6 +35,7 @@
 #include "miscadmin.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "parser/parse_relation.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/procarray.h"
@@ -2424,34 +2425,109 @@ zedstoream_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	zsbt_tuplebuffer_flush(NewHeap);
 }
 
+static void
+zedstoream_scan_analyze_beginscan(Relation onerel, AcquireSampleContext *context)
+{
+	zstid		tid;
+	List	   *va_cols = context->cols;
+	Bitmapset  *project_columns = NULL;
+
+	/* zedstore can sample rows on the specified columns only */
+	if (!va_cols)
+		context->scan = table_beginscan_analyze(onerel);
+	else
+	{
+		ListCell   *le;
+
+		foreach(le, va_cols)
+		{
+			char	   *col = strVal(lfirst(le));
+
+			project_columns =
+				bms_add_member(project_columns, attnameAttNum(onerel, col, false));
+		}
+
+		context->scan =
+			zedstoream_beginscan_with_column_projection(onerel, NULL, 0, NULL,
+														NULL, SO_TYPE_ANALYZE,
+														project_columns);
+	}
+
+	/* initialize the slot used to fetch sample rows */
+	InitSampleKind(context, onerel, SAMPLE_KIND_DATA);
+
+	/* zedstore also provides extra disk-size info when analyzing all columns */
+	if (!va_cols)
+		InitSampleKind(context, onerel, SAMPLE_KIND_DISKSIZE);
+
+	/* zedstore uses a logical block number to acquire sample rows */
+	tid = zsbt_get_last_tid(onerel);
+	context->totalblocks = ZSTidGetBlockNumber(tid) + 1;
+}
+
 /*
- * FIXME: The ANALYZE API is problematic for us. acquire_sample_rows() calls
- * RelationGetNumberOfBlocks() directly on the relation, and chooses the
- * block numbers to sample based on that. But the logical block numbers
- * have little to do with physical ones in zedstore.
+ * Get next logical block.
  */
 static bool
-zedstoream_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno,
-								   BufferAccessStrategy bstrategy)
+zedstoream_scan_analyze_next_block(BlockNumber blockno,
+								   AcquireSampleContext *context)
 {
-	return zs_blkscan_next_block(sscan, blockno, NULL, -1, false);
+	return zs_blkscan_next_block(context->scan, blockno, NULL, -1, false);
 }
 
 static bool
-zedstoream_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin,
-								   double *liverows, double *deadrows,
-								   TupleTableSlot *slot)
+zedstoream_scan_analyze_next_tuple(TransactionId OldestXmin, AcquireSampleContext *context)
 {
-	bool		result;
+	int		i;
+	bool	result;
+	AttrNumber		attno;
+	TableScanDesc	scan = context->scan;
+	ZedStoreDesc	sscan = (ZedStoreDesc) scan;
+	ZSAttrTreeScan	*attr_scan;
+	TupleTableSlot	*slot = context->k_slots[SAMPLE_KIND_DATA];
 
-	result = zs_blkscan_next_tuple(sscan, slot);
+	result = zs_blkscan_next_tuple(scan, slot);
 
 	if (result)
-		(*liverows)++;
+	{
+		/* provide extra disk info when analyzing on full columns */
+		if (!context->cols)
+		{
+			slot = context->k_slots[SAMPLE_KIND_DISKSIZE];
+
+			for (i = 1; i < sscan->proj_data.num_proj_atts; i++)
+			{
+				attr_scan = &sscan->proj_data.attr_scans[i - 1];	
+				attno = sscan->proj_data.proj_atts[i];
+
+				slot->tts_values[attno - 1] =
+					Float8GetDatum(attr_scan->decoder.avg_elements_size); 
+				slot->tts_isnull[attno - 1] = false;
+				slot->tts_flags &= ~TTS_FLAG_EMPTY;
+			}
+		}
+
+		context->liverows++;
+	}
 
 	return result;
 }
 
+static void
+zedstoream_scan_analyze_sample_tuple(Index sample, bool replace, AcquireSampleContext *context)
+{
+	RecordSampleKindRow(context, SAMPLE_KIND_DATA, sample, replace, false);
+
+	if (!context->cols)
+		RecordSampleKindRow(context, SAMPLE_KIND_DISKSIZE, sample, replace, false);
+}
+
+static void
+zedstoream_scan_analyze_endscan(AcquireSampleContext *context)
+{
+	table_endscan(context->scan);
+}
+
 /* ------------------------------------------------------------------------
  * Miscellaneous callbacks for the heap AM
  * ------------------------------------------------------------------------
@@ -2717,6 +2793,18 @@ zs_blkscan_next_tuple(TableScanDesc sscan, TupleTableSlot *slot)
 
 	if (scan->bmscan_nexttuple >= scan->bmscan_ntuples)
 		return false;
+
+	/*
+	 * Initialize the slot.
+	 *
+	 * We initialize all columns to NULL. The values for columns that are projected
+	 * will be set to the actual values below, but it's important that non-projected
+	 * columns are NULL.
+	 */
+	ExecClearTuple(slot);
+	for (int i = 0; i < sscan->rs_rd->rd_att->natts; i++)
+		slot->tts_isnull[i] = true;
+
 	/*
 	 * projection attributes were created based on Relation tuple descriptor
 	 * it better match TupleTableSlot.
@@ -2939,8 +3027,11 @@ static const TableAmRoutine zedstoream_methods = {
 	.relation_copy_data = zedstoream_relation_copy_data,
 	.relation_copy_for_cluster = zedstoream_relation_copy_for_cluster,
 	.relation_vacuum = zedstoream_vacuum_rel,
+	.scan_analyze_beginscan = zedstoream_scan_analyze_beginscan,
 	.scan_analyze_next_block = zedstoream_scan_analyze_next_block,
 	.scan_analyze_next_tuple = zedstoream_scan_analyze_next_tuple,
+	.scan_analyze_sample_tuple = zedstoream_scan_analyze_sample_tuple,
+	.scan_analyze_endscan = zedstoream_scan_analyze_endscan,
 
 	.index_build_range_scan = zedstoream_index_build_range_scan,
 	.index_validate_scan = zedstoream_index_validate_scan,
diff --git a/src/include/access/zedstore_internal.h b/src/include/access/zedstore_internal.h
index dedc867..b217b4a 100644
--- a/src/include/access/zedstore_internal.h
+++ b/src/include/access/zedstore_internal.h
@@ -78,6 +78,9 @@ typedef struct
 	char	   *chunks_buf;
 	int			chunks_buf_size;
 
+	/* attstream compression ratio */
+	float8		compression_ratio;
+
 	/* information about the current attstream in the buffer */
 	int			chunks_len;
 	zstid		firsttid;
@@ -96,6 +99,7 @@ typedef struct
 	Datum		datums[DECODER_MAX_ELEMS];
 	bool		isnulls[DECODER_MAX_ELEMS];
 	int			num_elements;
+	float8		avg_elements_size; /* avg physical size of elements */
 } attstream_decoder;
 
 /*
-- 
1.8.3.1

Reply via email to