Re: Pluggable Storage - Andres's take

Heikki Linnakangas Mon, 08 Apr 2019 05:36:59 -0700

I wrote a little toy implementation that just returns constant data to play with this a little. Looks good overall.

There were a bunch of typos in the comments in tableam.h, see attached. Some of the comments could use more copy-editing and clarification, I think, but I stuck to fixing just typos and such for now.

index_update_stats() calls RelationGetNumberOfBlocks(<table>). If the AM doesn't use normal data files, that won't work. I bumped into that with my toy implementation, which wouldn't need to create any data files, if it wasn't for this.

The comments for relation_set_new_relfilenode() callback say that the AM can set *freezeXid and *minmulti to invalid. But when I did that, VACUUM hits this assertion:

TRAP: FailedAssertion("!(((classForm->relfrozenxid) >= ((TransactionId)3)))", File: "vacuum.c", Line: 1323)

There's a little bug in index-only scan executor node, where it mixes up the slots to hold a tuple from the index, and from the table. That doesn't cause any ill effects if the AM uses TTSOpsHeapTuple, but with my toy AM, which uses a virtual slot, it caused warnings like this from index-only scans:

WARNING: problem in alloc set ExecutorState: detected write past chunk end in block 0x56419b0f88e8, chunk 0x56419b0f8f90

Attached is a patch with the toy implementation I used to test this. I'm not suggesting we should commit that - although feel free to do that if you think it's useful - but it shows how I bumped into these issues. The second patch fixes the index-only-scan slot confusion (untested, except with my toy AM).


- Heikki

From 97e0eea6a3fb123845ac5650f1aaa1802bf56694 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 8 Apr 2019 15:16:53 +0300
Subject: [PATCH 1/3] Add a toy table AM implementation to play with.

It returns a constant data set. No insert/update/delete. But you can
create indexes.
---
 src/test/modules/toytable/Makefile            |  25 +
 .../modules/toytable/expected/toytable.out    |  41 ++
 src/test/modules/toytable/sql/toytable.sql    |  17 +
 src/test/modules/toytable/toytable--1.0.sql   |  12 +
 src/test/modules/toytable/toytable.control    |   4 +
 src/test/modules/toytable/toytableam.c        | 612 ++++++++++++++++++
 6 files changed, 711 insertions(+)
 create mode 100644 src/test/modules/toytable/Makefile
 create mode 100644 src/test/modules/toytable/expected/toytable.out
 create mode 100644 src/test/modules/toytable/sql/toytable.sql
 create mode 100644 src/test/modules/toytable/toytable--1.0.sql
 create mode 100644 src/test/modules/toytable/toytable.control
 create mode 100644 src/test/modules/toytable/toytableam.c

diff --git a/src/test/modules/toytable/Makefile b/src/test/modules/toytable/Makefile
new file mode 100644
index 00000000000..142ef2d23e6
--- /dev/null
+++ b/src/test/modules/toytable/Makefile
@@ -0,0 +1,25 @@
+# src/test/modules/toytable/Makefile
+
+MODULE_big = toytable
+OBJS = toytableam.o $(WIN32RES)
+PGFILEDESC = "A dummy implementantation of the table AM API"
+
+EXTENSION = toytable
+DATA = toytable--1.0.sql
+
+REGRESS = toytable
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/toytable
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+OBJS = toytableam.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/test/modules/toytable/expected/toytable.out b/src/test/modules/toytable/expected/toytable.out
new file mode 100644
index 00000000000..3e8598e284c
--- /dev/null
+++ b/src/test/modules/toytable/expected/toytable.out
@@ -0,0 +1,41 @@
+CREATE EXTENSION toytable;
+create table toytab (i int4, j int4, k int4) using toytable;
+select * from toytab;
+ i  | j  | k  
+----+----+----
+  1 |  1 |  1
+  2 |  2 |  2
+  3 |  3 |  3
+  4 |  4 |  4
+  5 |  5 |  5
+  6 |  6 |  6
+  7 |  7 |  7
+  8 |  8 |  8
+  9 |  9 |  9
+ 10 | 10 | 10
+(10 rows)
+
+create index toyidx on toytab(i);
+-- test index scan
+set enable_seqscan=off;
+set enable_indexscan=on;
+select i, j from toytab where i = 4;
+ i | j 
+---+---
+ 4 | 4
+(1 row)
+
+-- index only scan
+explain (costs off) select i from toytab where i = 4;
+               QUERY PLAN               
+----------------------------------------
+ Index Only Scan using toyidx on toytab
+   Index Cond: (i = 4)
+(2 rows)
+
+select i from toytab where i = 4 ;
+ i 
+---
+ 4
+(1 row)
+
diff --git a/src/test/modules/toytable/sql/toytable.sql b/src/test/modules/toytable/sql/toytable.sql
new file mode 100644
index 00000000000..8d9bac41bbf
--- /dev/null
+++ b/src/test/modules/toytable/sql/toytable.sql
@@ -0,0 +1,17 @@
+CREATE EXTENSION toytable;
+
+create table toytab (i int4, j int4, k int4) using toytable;
+
+select * from toytab;
+
+create index toyidx on toytab(i);
+
+-- test index scan
+set enable_seqscan=off;
+set enable_indexscan=on;
+
+select i, j from toytab where i = 4;
+
+-- index only scan
+explain (costs off) select i from toytab where i = 4;
+select i from toytab where i = 4 ;
diff --git a/src/test/modules/toytable/toytable--1.0.sql b/src/test/modules/toytable/toytable--1.0.sql
new file mode 100644
index 00000000000..52085d27f4a
--- /dev/null
+++ b/src/test/modules/toytable/toytable--1.0.sql
@@ -0,0 +1,12 @@
+/* src/test/modules/toytable/toytable--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION toytable" to load this file. \quit
+
+CREATE FUNCTION toytableam_handler(internal)
+RETURNS pg_catalog.table_am_handler STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE ACCESS METHOD toytable TYPE TABLE HANDLER toytableam_handler
+
+
diff --git a/src/test/modules/toytable/toytable.control b/src/test/modules/toytable/toytable.control
new file mode 100644
index 00000000000..8f613e58d6e
--- /dev/null
+++ b/src/test/modules/toytable/toytable.control
@@ -0,0 +1,4 @@
+comment = 'Dummy implementation of table AM api'
+default_version = '1.0'
+module_pathname = '$libdir/toytable'
+relocatable = true
diff --git a/src/test/modules/toytable/toytableam.c b/src/test/modules/toytable/toytableam.c
new file mode 100644
index 00000000000..30b0e74e7f6
--- /dev/null
+++ b/src/test/modules/toytable/toytableam.c
@@ -0,0 +1,612 @@
+/*-------------------------------------------------------------------------
+ *
+ * toytableam.c
+ *	  a toy table access method code
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/test/modules/toytable/toytableam.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "miscadmin.h"
+
+#include "access/multixact.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "catalog/catalog.h"
+#include "catalog/storage.h"
+#include "catalog/index.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "storage/bufmgr.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(toytableam_handler);
+
+typedef struct
+{
+	TableScanDescData scan;
+
+	int			tupidx;
+} ToyScanDescData;
+typedef ToyScanDescData *ToyScanDesc;
+
+static const TupleTableSlotOps *
+toyam_slot_callbacks(Relation relation)
+{
+	return &TTSOpsVirtual;
+}
+
+static TableScanDesc toyam_scan_begin(Relation rel,
+							 Snapshot snapshot,
+							 int nkeys, struct ScanKeyData *key,
+							 ParallelTableScanDesc pscan,
+							 bool allow_strat,
+							 bool allow_sync,
+							 bool allow_pagemode,
+							 bool is_bitmapscan,
+							 bool is_samplescan,
+							 bool temp_snap)
+{
+	ToyScanDesc tscan;
+
+	tscan = palloc0(sizeof(ToyScanDescData));
+	tscan->scan.rs_rd = rel;
+	tscan->scan.rs_snapshot = snapshot;
+	tscan->scan.rs_nkeys = nkeys;
+	tscan->scan.rs_bitmapscan = is_bitmapscan;
+	tscan->scan.rs_samplescan = is_samplescan;
+	tscan->scan.rs_allow_strat = allow_strat;
+	tscan->scan.rs_allow_sync = allow_sync;
+	tscan->scan.rs_temp_snap = temp_snap;
+	tscan->scan.rs_parallel = pscan;
+
+	tscan->tupidx = 0;
+
+	return &tscan->scan;
+}
+
+static void
+toyam_scan_end(TableScanDesc scan)
+{
+	pfree(scan);
+}
+
+static void
+toyam_scan_rescan(TableScanDesc scan, struct ScanKeyData *key,
+				  bool set_params, bool allow_strat,
+				  bool allow_sync, bool allow_pagemode)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static bool
+toyam_scan_getnextslot(TableScanDesc scan,
+					   ScanDirection direction,
+					   TupleTableSlot *slot)
+{
+	ToyScanDesc tscan = (ToyScanDesc) scan;
+
+	slot->tts_nvalid = 0;
+	slot->tts_flags |= TTS_FLAG_EMPTY;
+
+	/*
+	 * Return a constant 10 rows. Every int4 attribute gets
+	 * a running count, everything else is NULL.
+	 */
+	if (tscan->tupidx < 10)
+	{
+		TupleDesc desc = RelationGetDescr(tscan->scan.rs_rd);
+
+		tscan->tupidx++;
+
+		for (AttrNumber attno = 1; attno <= desc->natts; attno++)
+		{
+			Form_pg_attribute att = &desc->attrs[attno - 1];
+			Datum		d;
+			bool		isnull;
+
+			if (att->atttypid == INT4OID)
+			{
+				d = tscan->tupidx;
+				isnull = false;
+			}
+			else
+			{
+				d = (Datum) 0;
+				isnull = true;
+			}
+
+			slot->tts_values[attno - 1] = d;
+			slot->tts_isnull[attno - 1] = isnull;
+		}
+
+		ItemPointerSet(&slot->tts_tid, 1, tscan->tupidx);
+		slot->tts_nvalid = slot->tts_tupleDescriptor->natts;
+		slot->tts_flags &= ~TTS_FLAG_EMPTY;
+
+		return true;
+	}
+	else
+		return false;
+}
+
+static Size
+toyam_parallelscan_estimate(Relation rel)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static Size
+toyam_parallelscan_initialize(Relation rel,
+							  ParallelTableScanDesc pscan)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_parallelscan_reinitialize(Relation rel,
+								ParallelTableScanDesc pscan)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static struct IndexFetchTableData *
+toyam_index_fetch_begin(Relation rel)
+{
+	IndexFetchTableData *tfetch = palloc0(sizeof(IndexFetchTableData));
+
+	tfetch->rel = rel;
+
+	return tfetch;
+}
+
+static void
+toyam_index_fetch_reset(struct IndexFetchTableData *data)
+{
+}
+
+static void
+toyam_index_fetch_end(struct IndexFetchTableData *data)
+{
+	pfree(data);
+}
+
+static bool
+toyam_index_fetch_tuple(struct IndexFetchTableData *scan,
+						ItemPointer tid,
+						Snapshot snapshot,
+						TupleTableSlot *slot,
+						bool *call_again, bool *all_dead)
+{
+	TupleDesc desc = RelationGetDescr(scan->rel);
+	int			tupidx;
+
+	if (ItemPointerGetBlockNumber(tid) != 1)
+		return false;
+
+	tupidx = ItemPointerGetOffsetNumber(tid);
+	if (tupidx < 1 || tupidx > 10)
+		return false;
+
+	slot->tts_nvalid = 0;
+	slot->tts_flags |= TTS_FLAG_EMPTY;
+
+	/* Return same data as toyam_scan_getnextslot does */
+	for (AttrNumber attno = 1; attno <= desc->natts; attno++)
+	{
+		Form_pg_attribute att = &desc->attrs[attno - 1];
+		Datum		d;
+		bool		isnull;
+
+		if (att->atttypid == INT4OID)
+		{
+			d = tupidx;
+			isnull = false;
+		}
+		else
+		{
+			d = (Datum) 0;
+			isnull = true;
+		}
+
+		slot->tts_values[attno - 1] = d;
+		slot->tts_isnull[attno - 1] = isnull;
+	}
+
+	ItemPointerSet(&slot->tts_tid, 1, tupidx);
+	slot->tts_nvalid = slot->tts_tupleDescriptor->natts;
+	slot->tts_flags &= ~TTS_FLAG_EMPTY;
+
+	return true;
+}
+
+static bool
+toyam_tuple_fetch_row_version(Relation rel,
+							  ItemPointer tid,
+							  Snapshot snapshot,
+							  TupleTableSlot *slot)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_tuple_get_latest_tid(Relation rel,
+						   Snapshot snapshot,
+						   ItemPointer tid)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static bool
+toyam_tuple_satisfies_snapshot(Relation rel,
+							   TupleTableSlot *slot,
+							   Snapshot snapshot)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static TransactionId
+toyam_compute_xid_horizon_for_tuples(Relation rel,
+									 ItemPointerData *items,
+									 int nitems)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_tuple_insert(Relation rel, TupleTableSlot *slot,
+				   CommandId cid, int options,
+				   struct BulkInsertStateData *bistate)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_tuple_insert_speculative(Relation rel,
+							   TupleTableSlot *slot,
+							   CommandId cid,
+							   int options,
+							   struct BulkInsertStateData *bistate,
+							   uint32 specToken)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_tuple_complete_speculative(Relation rel,
+								 TupleTableSlot *slot,
+								 uint32 specToken,
+								 bool succeeded)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static TM_Result
+toyam_tuple_delete(Relation rel,
+				   ItemPointer tid,
+				   CommandId cid,
+				   Snapshot snapshot,
+				   Snapshot crosscheck,
+				   bool wait,
+				   TM_FailureData *tmfd,
+				   bool changingPart)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
+				   CommandId cid, int options, struct BulkInsertStateData *bistate)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static TM_Result
+toyam_tuple_update(Relation rel,
+				   ItemPointer otid,
+				   TupleTableSlot *slot,
+				   CommandId cid,
+				   Snapshot snapshot,
+				   Snapshot crosscheck,
+				   bool wait,
+				   TM_FailureData *tmfd,
+				   LockTupleMode *lockmode,
+				   bool *update_indexes)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static TM_Result
+toyam_tuple_lock(Relation rel,
+				 ItemPointer tid,
+				 Snapshot snapshot,
+				 TupleTableSlot *slot,
+				 CommandId cid,
+				 LockTupleMode mode,
+				 LockWaitPolicy wait_policy,
+				 uint8 flags,
+				 TM_FailureData *tmfd)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_finish_bulk_insert(Relation rel, int options)
+{
+	return;
+}
+
+static void
+toyam_relation_set_new_filenode(Relation rel,
+								char persistence,
+								TransactionId *freezeXid,
+								MultiXactId *minmulti)
+{
+	*freezeXid = InvalidTransactionId;
+	*minmulti = InvalidMultiXactId;
+
+	/*
+	 * FIXME: We don't need this for anything. But index build calls
+	 * RelationGetNumberOfBlocks, from index_update_stats(), and that
+	 * fails if the underlying file doesn't exist.
+	 */
+	RelationCreateStorage(rel->rd_node, persistence);
+}
+
+static void
+toyam_relation_nontransactional_truncate(Relation rel)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_relation_copy_data(Relation rel, RelFileNode newrnode)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_relation_copy_for_cluster(Relation NewHeap,
+								Relation OldHeap,
+								Relation OldIndex,
+								bool use_sort,
+								TransactionId OldestXmin,
+								TransactionId FreezeXid,
+								MultiXactId MultiXactCutoff,
+								double *num_tuples,
+								double *tups_vacuumed,
+								double *tups_recently_dead)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_relation_vacuum(Relation onerel,
+					  struct VacuumParams *params,
+					  BufferAccessStrategy bstrategy)
+{
+	/* we've got nothing to do */
+}
+
+static bool
+toyam_scan_analyze_next_block(TableScanDesc scan,
+							  BlockNumber blockno,
+							  BufferAccessStrategy bstrategy)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static bool
+toyam_scan_analyze_next_tuple(TableScanDesc scan,
+							  TransactionId OldestXmin,
+							  double *liverows,
+							  double *deadrows,
+							  TupleTableSlot *slot)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static double
+toyam_index_build_range_scan(Relation heap_rel,
+							 Relation index_rel,
+							 struct IndexInfo *index_nfo,
+							 bool allow_sync,
+							 bool anyvisible,
+							 bool progress,
+							 BlockNumber start_blockno,
+							 BlockNumber end_blockno,
+							 IndexBuildCallback callback,
+							 void *callback_state,
+							 TableScanDesc scan)
+{
+	TupleTableSlot *slot;
+	EState     *estate;
+
+	estate = CreateExecutorState();
+	slot = table_slot_create(heap_rel, NULL);
+
+	if (!scan)
+		scan = toyam_scan_begin(heap_rel,
+								SnapshotAny,
+								0, NULL,
+								NULL,
+								false,
+								false,
+								false,
+								false,
+								false,
+								false);
+
+	while (toyam_scan_getnextslot(scan, ForwardScanDirection, slot))
+	{
+		Datum           values[INDEX_MAX_KEYS];
+		bool            isnull[INDEX_MAX_KEYS];
+		HeapTuple		heapTuple;
+
+		FormIndexDatum(index_nfo, slot, estate, values, isnull);
+
+		/* Call the AM's callback routine to process the tuple */
+		heapTuple = ExecCopySlotHeapTuple(slot);
+		heapTuple->t_self = slot->tts_tid;
+		callback(heap_rel, heapTuple, values, isnull, true,
+				 callback_state);
+		pfree(heapTuple);
+	}
+
+	toyam_scan_end(scan);
+	ExecDropSingleTupleTableSlot(slot);
+	FreeExecutorState(estate);
+
+	return 10;
+}
+
+static void
+toyam_index_validate_scan(Relation heap_rel,
+						  Relation index_rel,
+						  struct IndexInfo *index_info,
+						  Snapshot snapshot,
+						  struct ValidateIndexState *state)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static void
+toyam_relation_estimate_size(Relation rel, int32 *attr_widths,
+							 BlockNumber *pages, double *tuples,
+							 double *allvisfrac)
+{
+	*pages = 1;
+	*tuples = 1;
+	*allvisfrac = 1.0;
+}
+
+static bool
+toyam_scan_sample_next_block(TableScanDesc scan,
+							 struct SampleScanState *scanstate)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static bool
+toyam_scan_sample_next_tuple(TableScanDesc scan,
+					   struct SampleScanState *scanstate,
+					   TupleTableSlot *slot)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("function %s not implemented yet", __func__)));
+}
+
+static const TableAmRoutine toyam_methods = {
+	.type = T_TableAmRoutine,
+
+	.slot_callbacks = toyam_slot_callbacks,
+
+	.scan_begin = toyam_scan_begin,
+	.scan_end = toyam_scan_end,
+	.scan_rescan = toyam_scan_rescan,
+	.scan_getnextslot = toyam_scan_getnextslot,
+
+	.parallelscan_estimate = toyam_parallelscan_estimate,
+	.parallelscan_initialize = toyam_parallelscan_initialize,
+	.parallelscan_reinitialize = toyam_parallelscan_reinitialize,
+
+	.index_fetch_begin = toyam_index_fetch_begin,
+	.index_fetch_reset = toyam_index_fetch_reset,
+	.index_fetch_end = toyam_index_fetch_end,
+	.index_fetch_tuple = toyam_index_fetch_tuple,
+
+	.tuple_fetch_row_version = toyam_tuple_fetch_row_version,
+	.tuple_get_latest_tid = toyam_tuple_get_latest_tid,
+	.tuple_satisfies_snapshot = toyam_tuple_satisfies_snapshot,
+	.compute_xid_horizon_for_tuples = toyam_compute_xid_horizon_for_tuples,
+
+	.tuple_insert = toyam_tuple_insert,
+	.tuple_insert_speculative = toyam_tuple_insert_speculative,
+	.tuple_complete_speculative = toyam_tuple_complete_speculative,
+	.multi_insert = toyam_multi_insert,
+	.tuple_delete = toyam_tuple_delete,
+	.tuple_update = toyam_tuple_update,
+	.tuple_lock = toyam_tuple_lock,
+	.finish_bulk_insert = toyam_finish_bulk_insert,
+
+	.relation_set_new_filenode = toyam_relation_set_new_filenode,
+	.relation_nontransactional_truncate = toyam_relation_nontransactional_truncate,
+	.relation_copy_data = toyam_relation_copy_data,
+	.relation_copy_for_cluster = toyam_relation_copy_for_cluster,
+	.relation_vacuum = toyam_relation_vacuum,
+
+	.scan_analyze_next_block = toyam_scan_analyze_next_block,
+	.scan_analyze_next_tuple = toyam_scan_analyze_next_tuple,
+	.index_build_range_scan = toyam_index_build_range_scan,
+	.index_validate_scan = toyam_index_validate_scan,
+
+	.relation_estimate_size = toyam_relation_estimate_size,
+
+	.scan_bitmap_next_block = NULL,
+	.scan_bitmap_next_tuple = NULL,
+	.scan_sample_next_block = toyam_scan_sample_next_block,
+	.scan_sample_next_tuple = toyam_scan_sample_next_tuple,
+};
+
+Datum
+toytableam_handler(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_POINTER(&toyam_methods);
+}
-- 
2.20.1

From b329e4345731cd84708e5efcc51e3d5298c27bb2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 8 Apr 2019 15:18:19 +0300
Subject: [PATCH 2/3] Fix confusion on different kinds of slots in
 IndexOnlyScans.

We used the same slot, to store a tuple from the index, and to store a
tuple from the table. That's not OK. It worked with the heap, because
heapam_getnextslot() stores a HeapTuple to the slot, and doesn't care how
large the tts_values/nulls arrays are. But when I played with a toy table
AM implementation that used a virtual tuple, it caused memory overruns.
---
 src/backend/executor/nodeIndexonlyscan.c | 16 +++++++++++++---
 src/include/nodes/execnodes.h            |  1 +
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 7711728495c..5833d683b38 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -166,10 +166,10 @@ IndexOnlyNext(IndexOnlyScanState *node)
 			 * Rats, we have to visit the heap to check visibility.
 			 */
 			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, slot))
+			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
 				continue;		/* no visible tuple, try next index entry */
 
-			ExecClearTuple(slot);
+			ExecClearTuple(node->ioss_TableSlot);
 
 			/*
 			 * Only MVCC snapshots are supported here, so there should be no
@@ -528,7 +528,17 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	 */
 	tupDesc = ExecTypeFromTL(node->indextlist);
 	ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc,
-						  table_slot_callbacks(currentRelation));
+						  &TTSOpsVirtual);
+
+	/*
+	 * We need another slot, in a format that's suitable for the table AM,
+	 * for when we need to fetch a tuple from the table for rechecking
+	 * visibility.
+	 */
+	indexstate->ioss_TableSlot =
+		ExecAllocTableSlot(&estate->es_tupleTable,
+						   RelationGetDescr(currentRelation),
+						   table_slot_callbacks(currentRelation));
 
 	/*
 	 * Initialize result type and projection info.  The node's targetlist will
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a5e4b7ef2e0..108dee61e24 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1424,6 +1424,7 @@ typedef struct IndexOnlyScanState
 	struct IndexScanDescData *ioss_ScanDesc;
 	Buffer		ioss_VMBuffer;
 	Size		ioss_PscanLen;
+	TupleTableSlot *ioss_TableSlot;
 } IndexOnlyScanState;
 
 /* ----------------
-- 
2.20.1

From 213e33f92532201d0d278394cac7ffcaf0dccafa Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 8 Apr 2019 15:28:00 +0300
Subject: [PATCH 3/3] Fix typos and grammar in tableam.h comments.

---
 src/include/access/tableam.h | 119 +++++++++++++++++------------------
 1 file changed, 59 insertions(+), 60 deletions(-)

diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 51398f35c01..ab80919f8d0 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -26,6 +26,7 @@
 
 #define DEFAULT_TABLE_ACCESS_METHOD	"heap"
 
+/* GUCs */
 extern char *default_table_access_method;
 extern bool synchronize_seqscans;
 
@@ -40,7 +41,7 @@ struct ValidateIndexState;
 
 
 /*
- * Result codes for table_{update,delete,lock}_tuple, and for visibility
+ * Result codes for table_{update,delete,lock_tuple}, and for visibility
  * routines inside table AMs.
  */
 typedef enum TM_Result
@@ -68,8 +69,8 @@ typedef enum TM_Result
 
 	/*
 	 * The affected tuple is currently being modified by another session. This
-	 * will only be returned if (update/delete/lock)_tuple are instructed not
-	 * to wait.
+	 * will only be returned if table_(update/delete/lock_tuple) are instructed
+	 * not to wait.
 	 */
 	TM_BeingModified,
 
@@ -82,12 +83,15 @@ typedef enum TM_Result
  * When table_update, table_delete, or table_lock_tuple fail because the target
  * tuple is already outdated, they fill in this struct to provide information
  * to the caller about what happened.
+ *
  * ctid is the target's ctid link: it is the same as the target's TID if the
  * target was deleted, or the location of the replacement tuple if the target
  * was updated.
+ *
  * xmax is the outdating transaction's XID.  If the caller wants to visit the
  * replacement tuple, it must check that this matches before believing the
  * replacement is really a match.
+ *
  * cmax is the outdating command's CID, but only when the failure code is
  * TM_SelfModified (i.e., something in the current transaction outdated the
  * tuple); otherwise cmax is zero.  (We make this restriction because
@@ -108,10 +112,10 @@ typedef struct TM_FailureData
 #define TABLE_INSERT_FROZEN			0x0004
 #define TABLE_INSERT_NO_LOGICAL		0x0008
 
-/* flag bits fortable_lock_tuple */
+/* flag bits for table_lock_tuple */
 /* Follow tuples whose update is in progress if lock modes don't conflict  */
 #define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS	(1 << 0)
-/* Follow update chain and lock lastest version of tuple */
+/* Follow update chain and lock latest version of tuple */
 #define TUPLE_LOCK_FLAG_FIND_LAST_VERSION		(1 << 1)
 
 
@@ -128,8 +132,8 @@ typedef void (*IndexBuildCallback) (Relation index,
  * server-lifetime manner, typically as a static const struct, which then gets
  * returned by FormData_pg_am.amhandler.
  *
- * I most cases it's not appropriate to directly call the callbacks directly,
- * instead use the table_* wrapper functions.
+ * In most cases it's not appropriate to call the callbacks directly, use the
+ * table_* wrapper functions instead.
  *
  * GetTableAmRoutine() asserts that required callbacks are filled in, remember
  * to update when adding a callback.
@@ -194,7 +198,7 @@ typedef struct TableAmRoutine
 	void		(*scan_end) (TableScanDesc scan);
 
 	/*
-	 * Restart relation scan.  If set_params is set to true, allow{strat,
+	 * Restart relation scan.  If set_params is set to true, allow_{strat,
 	 * sync, pagemode} (see scan_begin) changes should be taken into account.
 	 */
 	void		(*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
@@ -222,7 +226,7 @@ typedef struct TableAmRoutine
 
 	/*
 	 * Initialize ParallelTableScanDesc for a parallel scan of this relation.
-	 * pscan will be sized according to parallelscan_estimate() for the same
+	 * `pscan` will be sized according to parallelscan_estimate() for the same
 	 * relation.
 	 */
 	Size		(*parallelscan_initialize) (Relation rel,
@@ -243,7 +247,7 @@ typedef struct TableAmRoutine
 
 	/*
 	 * Prepare to fetch tuples from the relation, as needed when fetching
-	 * tuples for an index scan.  The callback has to return a
+	 * tuples for an index scan.  The callback has to return an
 	 * IndexFetchTableData, which the AM will typically embed in a larger
 	 * structure with additional information.
 	 *
@@ -268,16 +272,16 @@ typedef struct TableAmRoutine
 	 * test, return true, false otherwise.
 	 *
 	 * Note that AMs that do not necessarily update indexes when indexed
-	 * columns do not change, need to return the current/correct version of a
-	 * tuple as appropriate, even if the tid points to an older version of the
-	 * tuple.
+	 * columns do not change, need to return the current/correct version of
+	 * the tuple that is visible to the snapshot, even if the tid points to an
+	 * older version of the tuple.
 	 *
 	 * *call_again is false on the first call to index_fetch_tuple for a tid.
 	 * If there potentially is another tuple matching the tid, *call_again
 	 * needs be set to true by index_fetch_tuple, signalling to the caller
 	 * that index_fetch_tuple should be called again for the same tid.
 	 *
-	 * *all_dead, if all_dead is not NULL, should be set to true if by
+	 * *all_dead, if all_dead is not NULL, should be set to true by
 	 * index_fetch_tuple iff it is guaranteed that no backend needs to see
 	 * that tuple. Index AMs can use that do avoid returning that tid in
 	 * future searches.
@@ -288,14 +292,14 @@ typedef struct TableAmRoutine
 									  TupleTableSlot *slot,
 									  bool *call_again, bool *all_dead);
 
+
 	/* ------------------------------------------------------------------------
 	 * Callbacks for non-modifying operations on individual tuples
 	 * ------------------------------------------------------------------------
 	 */
 
-
 	/*
-	 * Fetch tuple at `tid` into `slot, after doing a visibility test
+	 * Fetch tuple at `tid` into `slot`, after doing a visibility test
 	 * according to `snapshot`. If a tuple was found and passed the visibility
 	 * test, returns true, false otherwise.
 	 */
@@ -390,13 +394,13 @@ typedef struct TableAmRoutine
 	/*
 	 * Perform operations necessary to complete insertions made via
 	 * tuple_insert and multi_insert with a BulkInsertState specified. This
-	 * e.g. may e.g. used to flush the relation when inserting with
-	 * TABLE_INSERT_SKIP_WAL specified.
+	 * may for example be used to flush the relation, when the
+	 * TABLE_INSERT_SKIP_WAL option was used.
 	 *
 	 * Typically callers of tuple_insert and multi_insert will just pass all
-	 * the flags the apply to them, and each AM has to decide which of them
+	 * the flags that apply to them, and each AM has to decide which of them
 	 * make sense for it, and then only take actions in finish_bulk_insert
-	 * that make sense for a specific AM.
+	 * for those flags, and ignore others.
 	 *
 	 * Optional callback.
 	 */
@@ -412,10 +416,10 @@ typedef struct TableAmRoutine
 	 * This callback needs to create a new relation filenode for `rel`, with
 	 * appropriate durability behaviour for `persistence`.
 	 *
-	 * On output *freezeXid, *minmulti should be set to the values appropriate
-	 * for pg_class.{relfrozenxid, relminmxid} have to be set to. For AMs that
-	 * don't need those fields to be filled they can be set to
-	 * InvalidTransactionId, InvalidMultiXactId respectively.
+	 * On output *freezeXid, *minmulti must be set to the values appropriate
+	 * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
+	 * fields to be filled they can be set to InvalidTransactionId and
+	 * InvalidMultiXactId, respectively.
 	 *
 	 * See also table_relation_set_new_filenode().
 	 */
@@ -463,8 +467,8 @@ typedef struct TableAmRoutine
 	 * locked with a ShareUpdateExclusive lock.
 	 *
 	 * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
-	 * this routine, even if (in the latter case), part of the same VACUUM
-	 * command.
+	 * this routine, even if (for ANALYZE) it is part of the same
+	 * VACUUM command.
 	 *
 	 * There probably, in the future, needs to be a separate callback to
 	 * integrate with autovacuum's scheduling.
@@ -487,8 +491,8 @@ typedef struct TableAmRoutine
 	 * sampling, e.g. because it's a metapage that could never contain tuples.
 	 *
 	 * XXX: This obviously is primarily suited for block-based AMs. It's not
-	 * clear what a good interface for non block based AMs would be, so don't
-	 * try to invent one yet.
+	 * clear what a good interface for non block based AMs would be, so there
+	 * isn't one yet.
 	 */
 	bool		(*scan_analyze_next_block) (TableScanDesc scan,
 											BlockNumber blockno,
@@ -537,7 +541,7 @@ typedef struct TableAmRoutine
 	/*
 	 * See table_relation_estimate_size().
 	 *
-	 * While block oriented, it shouldn't be too hard to for an AM that
+	 * While block oriented, it shouldn't be too hard for an AM that
 	 * doesn't internally use blocks to convert into a usable representation.
 	 */
 	void		(*relation_estimate_size) (Relation rel, int32 *attr_widths,
@@ -553,7 +557,7 @@ typedef struct TableAmRoutine
 	/*
 	 * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
 	 * of a bitmap table scan. `scan` was started via table_beginscan_bm().
-	 * Return false if there's no tuples to be found on the page, true
+	 * Return false if there are no tuples to be found on the page, true
 	 * otherwise.
 	 *
 	 * This will typically read and pin the target block, and do the necessary
@@ -617,8 +621,8 @@ typedef struct TableAmRoutine
 	 * Note that it's not acceptable to hold deadlock prone resources such as
 	 * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
 	 * block - the tuple is likely to be returned to an upper query node, and
-	 * the next call could be off a long while. Holding buffer pins etc is
-	 * obviously OK.
+	 * the next call could be off a long while. Holding buffer pins and such
+	 * is obviously OK.
 	 *
 	 * Currently it is required to implement this interface, as there's no
 	 * alternative way (contrary e.g. to bitmap scans) to implement sample
@@ -707,7 +711,6 @@ table_beginscan_strat(Relation rel, Snapshot snapshot,
 									   false, false, false);
 }
 
-
 /*
  * table_beginscan_bm is an alternative entry point for setting up a
  * TableScanDesc for a bitmap heap scan.  Although that scan technology is
@@ -762,7 +765,6 @@ table_endscan(TableScanDesc scan)
 	scan->rs_rd->rd_tableam->scan_end(scan);
 }
 
-
 /*
  * Restart a relation scan.
  */
@@ -795,7 +797,6 @@ table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key,
  */
 extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot);
 
-
 /*
  * Return next tuple from `scan`, store in slot.
  */
@@ -833,7 +834,7 @@ extern void table_parallelscan_initialize(Relation rel,
  * table_parallelscan_initialize(), for the same relation. The initialization
  * does not need to have happened in this backend.
  *
- * Caller must hold a suitable lock on the correct relation.
+ * Caller must hold a suitable lock on the relation.
  */
 extern TableScanDesc table_beginscan_parallel(Relation rel,
 						 ParallelTableScanDesc pscan);
@@ -904,7 +905,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan)
  * The difference between this function and table_fetch_row_version is that
  * this function returns the currently visible version of a row if the AM
  * supports storing multiple row versions reachable via a single index entry
- * (like heap's HOT). Whereas table_fetch_row_version only evaluates the the
+ * (like heap's HOT). Whereas table_fetch_row_version only evaluates the
  * tuple exactly at `tid`. Outside of index entry ->table tuple lookups,
  * table_fetch_row_version is what's usually needed.
  */
@@ -940,7 +941,7 @@ extern bool table_index_fetch_tuple_check(Relation rel,
 
 
 /*
- * Fetch tuple at `tid` into `slot, after doing a visibility test according to
+ * Fetch tuple at `tid` into `slot`, after doing a visibility test according to
  * `snapshot`. If a tuple was found and passed the visibility test, returns
  * true, false otherwise.
  *
@@ -1009,8 +1010,8 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  * behaviour of the AM. Several options might be ignored by AMs not supporting
  * them.
  *
- * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple will not
- * necessarily logged to WAL, even for a non-temp relation. It is the AMs
+ * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
+ * need to be logged to WAL, even for a non-temp relation. It is the AM's
  * choice whether this optimization is supported.
  *
  * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
@@ -1030,7 +1031,7 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  * relation.
  *
  * Note that most of these options will be applied when inserting into the
- * heap's TOAST table, too, if the tuple requires any out-of-line data
+ * heap's TOAST table, too, if the tuple requires any out-of-line data.
  *
  *
  * The BulkInsertState object (if any; bistate can be NULL for default
@@ -1082,7 +1083,7 @@ table_complete_speculative(Relation rel, TupleTableSlot *slot,
 }
 
 /*
- * Insert multiple tuple into a table.
+ * Insert multiple tuples into a table.
  *
  * This is like table_insert(), but inserts multiple tuples in one
  * operation. That's often faster than calling table_insert() in a loop,
@@ -1121,10 +1122,9 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
  *	changingPart - true iff the tuple is being moved to another partition
  *		table due to an update of the partition key. Otherwise, false.
  *
- * Normal, successful return value is TM_Ok, which
- * actually means we did delete it.  Failure return codes are
- * TM_SelfModified, TM_Updated, or TM_BeingModified
- * (the last only possible if wait == false).
+ * Normal, successful return value is TM_Ok, which means we did actually
+ * delete it.  Failure return codes are TM_SelfModified, TM_Updated, and
+ * TM_BeingModified (the last only possible if wait == false).
  *
  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
- * t_xmax, and, if possible, and, if possible, t_cmax.  See comments for
+ * t_xmax, and, if possible, t_cmax.  See comments for
@@ -1160,10 +1160,9 @@ table_delete(Relation rel, ItemPointer tid, CommandId cid,
  *  update_indexes - in success cases this is set to true if new index entries
  *		are required for this tuple
  *
- * Normal, successful return value is TM_Ok, which
- * actually means we *did* update it.  Failure return codes are
- * TM_SelfModified, TM_Updated, or TM_BeingModified
- * (the last only possible if wait == false).
+ * Normal, successful return value is TM_Ok, which means we did actually
+ * update it.  Failure return codes are TM_SelfModified, TM_Updated, and
+ * TM_BeingModified (the last only possible if wait == false).
  *
  * On success, the slot's tts_tid and tts_tableOid are updated to match the new
  * stored tuple; in particular, slot->tts_tid is set to the TID where the
@@ -1201,8 +1200,8 @@ table_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
  *	flags:
  *		If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to
  *		also lock descendant tuples if lock modes don't conflict.
- *		If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, update chain and lock latest
- *		version.
+ *		If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock
+ *		the latest version.
  *
  * Output parameters:
  *	*slot: contains the target tuple
@@ -1303,7 +1302,7 @@ table_relation_copy_data(Relation rel, RelFileNode newrnode)
  * is copied in that index's order; if use_sort is false and OidIndex is
  * InvalidOid, no sorting is performed.
  *
- * OldestXmin, FreezeXid, MultiXactCutoff need to currently valid values for
+ * OldestXmin, FreezeXid, MultiXactCutoff must be currently valid values for
  * the table.
  *
  * *num_tuples, *tups_vacuumed, *tups_recently_dead will contain statistics
@@ -1329,15 +1328,15 @@ table_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 }
 
 /*
- * Perform VACUUM on the relation. The VACUUM can be user triggered or by
+ * Perform VACUUM on the relation. The VACUUM can be triggered by a user or by
  * autovacuum. The specific actions performed by the AM will depend heavily on
  * the individual AM.
 
- * On entry a transaction needs to already been established, and the
+ * On entry a transaction needs to have already been established, and the
- * transaction is locked with a ShareUpdateExclusive lock.
+ * table is locked with a ShareUpdateExclusive lock.
  *
  * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this
- * routine, even if (in the latter case), part of the same VACUUM command.
+ * routine, even if (for ANALYZE) it is part of the same VACUUM command.
  */
 static inline void
 table_relation_vacuum(Relation rel, struct VacuumParams *params,
@@ -1363,7 +1362,7 @@ table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
 }
 
 /*
- * Iterate over tuples tuples in the block selected with
+ * Iterate over tuples in the block selected with
  * table_scan_analyze_next_block() (which needs to have returned true, and
  * this routine may not have returned false for the same block before). If a
  * tuple that's suitable for sampling is found, true is returned and a tuple
@@ -1383,7 +1382,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 }
 
 /*
- * table_index_build_range_scan - scan the table to find tuples to be indexed
+ * table_index_build_scan - scan the table to find tuples to be indexed
  *
  * This is called back from an access-method-specific index build procedure
  * after the AM has done whatever setup it needs.  The parent heap relation
@@ -1515,8 +1514,8 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
 /*
  * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
  * a bitmap table scan. `scan` needs to have been started via
- * table_beginscan_bm(). Returns false if there's no tuples to be found on the
- * page, true otherwise.
+ * table_beginscan_bm(). Returns false if there are no tuples to be found on
+ * the page, true otherwise.
  *
  * Note, this is an optionally implemented function, therefore should only be
  * used after verifying the presence (at plan time or such).
-- 
2.20.1

Re: Pluggable Storage - Andres's take

Reply via email to