From cdb9fc24e192ff50c48dbb874742c51abb83c1cd Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Sat, 23 Jan 2021 16:35:21 +0900
Subject: [PATCH v5] Allow batching of inserts to occur in some cases

Currently, the insert component of a cross-partition update of a
partitioned table inadvertently tries to use batching but it doesn't
work for two main reasons:

a) postgresGetForeignModifyBatchSize() looks at the wrong
PgFdwModifyState, one that does not belong to the insert, so has no
batching information.

b) ExecModifyTable(), when inserting any remaining batched tuples,
would fail to look at the ResultRelInfos used by such inserts because
they are not present in es_tuple_routing_result_relations, which
would result in those tuples not actually getting inserted.

This commit fixes both (a) and (b) so that those inserts can
correctly use batching.

To fix (a), postgresGetForeignModifyBatchSize() now uses the
PgFdwModifyState that actually belongs to the insert which contains
the information necessary to perform batching.

To fix (b), ExecModifyTable() now gets the ResultRelInfos to insert
any remaining batched tuples from the PartitionTupleRouting data
structure which does contain the ResultRelInfo that would be used
by the batched insert.  To implement this, this commit exposes the
definition of PartitionTupleRouting which was previously local to
execPartition.c.
---
 .../postgres_fdw/expected/postgres_fdw.out    | 23 +++++-
 contrib/postgres_fdw/postgres_fdw.c           | 12 +++-
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 19 ++++-
 src/backend/executor/execPartition.c          | 69 ------------------
 src/backend/executor/nodeModifyTable.c        | 38 +++++++---
 src/include/executor/execPartition.h          | 72 ++++++++++++++++++-
 6 files changed, 150 insertions(+), 83 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 60c7e115d6..3326f1b542 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -9414,5 +9414,26 @@ SELECT COUNT(*) FROM batch_table;
     66
 (1 row)
 
+-- Check that enabling batched inserts doesn't interfere with cross-partition
+-- updates
+CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
+CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test1_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (1)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
+CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (2);
+INSERT INTO batch_cp_upd_test VALUES (1), (2);
+-- The following moves a row from the local partition to the foreign one
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
+SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+       tableoid       | a 
+----------------------+---
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test1_f | 1
+(2 rows)
+
 -- Clean up
-DROP TABLE batch_table CASCADE;
+DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 368997d9d1..53472cf73c 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -1934,17 +1934,25 @@ static int
 postgresGetForeignModifyBatchSize(ResultRelInfo *resultRelInfo)
 {
 	int	batch_size;
+	PgFdwModifyState *fmstate = (PgFdwModifyState *) resultRelInfo->ri_FdwState;
 
 	/* should be called only once */
 	Assert(resultRelInfo->ri_BatchSize == 0);
 
+	/*
+	 * Use the auxiliary state if any; see postgresBeginForeignInsert() for
+	 * details on what it represents.
+	 */
+	if (fmstate && fmstate->aux_fmstate != NULL)
+		fmstate = fmstate->aux_fmstate;
+
 	/*
 	 * In EXPLAIN without ANALYZE, ri_fdwstate is NULL, so we have to lookup
 	 * the option directly in server/table options. Otherwise just use the
 	 * value we determined earlier.
 	 */
-	if (resultRelInfo->ri_FdwState)
-		batch_size = ((PgFdwModifyState *) resultRelInfo->ri_FdwState)->batch_size;
+	if (fmstate)
+		batch_size = fmstate->batch_size;
 	else
 		batch_size = get_batch_size_option(resultRelInfo->ri_RelationDesc);
 
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 151f4f1834..2b525ea44a 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2909,5 +2909,22 @@ CREATE TABLE batch_table_p2
 INSERT INTO batch_table SELECT * FROM generate_series(1, 66) i;
 SELECT COUNT(*) FROM batch_table;
 
+-- Check that enabling batched inserts doesn't interfere with cross-partition
+-- updates
+CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
+CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test1_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (1)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
+CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (2);
+INSERT INTO batch_cp_upd_test VALUES (1), (2);
+
+-- The following moves a row from the local partition to the foreign one
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
+SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+
 -- Clean up
-DROP TABLE batch_table CASCADE;
+DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index b9e4f2d80b..811997b4c8 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -35,75 +35,6 @@
 #include "utils/ruleutils.h"
 
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to
- * route a tuple inserted into a partitioned table to one of its leaf
- * partitions.
- *
- * partition_root
- *		The partitioned table that's the target of the command.
- *
- * partition_dispatch_info
- *		Array of 'max_dispatch' elements containing a pointer to a
- *		PartitionDispatch object for every partitioned table touched by tuple
- *		routing.  The entry for the target partitioned table is *always*
- *		present in the 0th element of this array.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * nonleaf_partitions
- *		Array of 'max_dispatch' elements containing pointers to fake
- *		ResultRelInfo objects for nonleaf partitions, useful for checking
- *		the partition constraint.
- *
- * num_dispatch
- *		The current number of items stored in the 'partition_dispatch_info'
- *		array.  Also serves as the index of the next free array element for
- *		new PartitionDispatch objects that need to be stored.
- *
- * max_dispatch
- *		The current allocated size of the 'partition_dispatch_info' array.
- *
- * partitions
- *		Array of 'max_partitions' elements containing a pointer to a
- *		ResultRelInfo for every leaf partitions touched by tuple routing.
- *		Some of these are pointers to ResultRelInfos which are borrowed out of
- *		'subplan_resultrel_htab'.  The remainder have been built especially
- *		for tuple routing.  See comment for PartitionDispatchData->indexes for
- *		details on how this array is indexed.
- *
- * num_partitions
- *		The current number of items stored in the 'partitions' array.  Also
- *		serves as the index of the next free array element for new
- *		ResultRelInfo objects that need to be stored.
- *
- * max_partitions
- *		The current allocated size of the 'partitions' array.
- *
- * subplan_resultrel_htab
- *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
- *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
- *		NULL in other cases.  Some of these may be useful for tuple routing
- *		to save having to build duplicates.
- *
- * memcxt
- *		Memory context used to allocate subsidiary structs.
- *-----------------------
- */
-struct PartitionTupleRouting
-{
-	Relation	partition_root;
-	PartitionDispatch *partition_dispatch_info;
-	ResultRelInfo **nonleaf_partitions;
-	int			num_dispatch;
-	int			max_dispatch;
-	ResultRelInfo **partitions;
-	int			num_partitions;
-	int			max_partitions;
-	HTAB	   *subplan_resultrel_htab;
-	MemoryContext memcxt;
-};
-
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
  * hierarchy required to route a tuple to any of its partitions.  A
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2993ba43e3..e38d7472dc 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2059,8 +2059,9 @@ ExecModifyTable(PlanState *pstate)
 	HeapTupleData oldtupdata;
 	HeapTuple	oldtuple;
 	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
-	List				  *relinfos = NIL;
-	ListCell			  *lc;
+	ResultRelInfo **relinfos;
+	int			nrels;
+	int			i;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -2277,17 +2278,38 @@ ExecModifyTable(PlanState *pstate)
 	}
 
 	/*
-	 * Insert remaining tuples for batch insert.
+	 * Insert any remaining batched tuples.
+	 *
+	 * If the query's main target relation is a partitioned table, any inserts
+	 * would have been performed using tuple routing, so use the appropriate
+	 * set of target relations.  Note that this also covers any inserts
+	 * performed during cross-partition UPDATEs that would have occurred
+	 * through tuple routing.
 	 */
 	if (proute)
-		relinfos = estate->es_tuple_routing_result_relations;
+	{
+		Assert(node->rootResultRelInfo->ri_RelationDesc->rd_rel->relkind ==
+			   RELKIND_PARTITIONED_TABLE);
+		relinfos = proute->partitions;
+		nrels = proute->num_partitions;
+	}
 	else
-		relinfos = estate->es_opened_result_relations;
+	{
+		/* Otherwise, use result relations from the range table. */
+		relinfos = estate->es_result_relations;
+		nrels = estate->es_range_table_size;
+	}
 
-	foreach(lc, relinfos)
+	for (i = 0; i < nrels; i++)
 	{
-		resultRelInfo = lfirst(lc);
-		if (resultRelInfo->ri_NumSlots > 0)
+		resultRelInfo = relinfos[i];
+
+		/*
+		 * es_result_relations array is same length as the range table though
+		 * not all relations in it are necessarily result relations, so some
+		 * entries in it may be NULL.
+		 */
+		if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
 			ExecBatchInsert(node, resultRelInfo,
 						   resultRelInfo->ri_Slots,
 						   resultRelInfo->ri_PlanSlots,
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index d30ffde7d9..846261ef88 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,9 +18,77 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definitions. */
+/* See execPartition.c for the definition. */
 typedef struct PartitionDispatchData *PartitionDispatch;
-typedef struct PartitionTupleRouting PartitionTupleRouting;
+
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partitions touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out of
+ *		'subplan_resultrel_htab'.  The remainder have been built especially
+ *		for tuple routing.  See comment for PartitionDispatchData->indexes for
+ *		details on how this array is indexed.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *		NULL in other cases.  Some of these may be useful for tuple routing
+ *		to save having to build duplicates.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	int			num_partitions;
+	int			max_partitions;
+	HTAB	   *subplan_resultrel_htab;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
-- 
2.24.1

