Hi,

Andres added to CC because of TTS interface: apparently calling
slot_getallattrs() on a virtual slot raises error that "getsomeattrs is
not required to be called on a virtual tuple table slot".  I'm thinking
that this exposes implementation details that should not be necessary
for a caller to know; patch 0001 fixes that at the problematic caller by
making the slot_getallattrs() call conditional on the slot not being
virtual, but I wonder if the better fix isn't to remove the elog(ERROR)
at tts_virtual_getsomeattrs.

Moving on from that -- this is a version of Amit's previous patch that I
like better.  I think the "prev_myslot" thing was a bit ugly, but it
turns out that with the above fix we can clear the slot before creating
the new one, making things more sensible.  I also changed the "do {}
while ()" into a simple "while {}" loop, which is sensible since the
condition is true on loop entrance.  Minor comment rewording also.

-- 
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From ef139f89531ba15f480cbb64c2bddeee03dc20ab Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 8 Sep 2020 16:55:30 -0300
Subject: [PATCH v3 1/2] Don't "getsomeattrs" on virtual slots
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

execute_attr_map_slot() was not careful about not calling
slot_getallattrs() on tuple slots that could be virtual.  Repair.

Author: Álvaro Herrera <alvhe...@alvh.no-ip.org>
---
 src/backend/access/common/tupconvert.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c
index 3cb0cbefaa..d440c1201a 100644
--- a/src/backend/access/common/tupconvert.c
+++ b/src/backend/access/common/tupconvert.c
@@ -193,7 +193,8 @@ execute_attr_map_slot(AttrMap *attrMap,
 	outnatts = out_slot->tts_tupleDescriptor->natts;
 
 	/* Extract all the values of the in slot. */
-	slot_getallattrs(in_slot);
+	if (!TTS_IS_VIRTUAL(in_slot))
+		slot_getallattrs(in_slot);
 
 	/* Before doing the mapping, clear any old contents from the out slot */
 	ExecClearTuple(out_slot);
-- 
2.20.1

>From e5a2b5ddbbb55dd9a83474f3f55b8f8724a3a63e Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Thu, 3 Sep 2020 13:18:35 -0400
Subject: [PATCH v3 2/2] Check default partitions constraints while descending
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Partitioning tuple route code assumes that the partition chosen while
descending the partition hierarchy is always the correct one.  This is
true except when the partition is the default partition and another
partition has been added concurrently: the partition constraint changes
and we don't recheck it.  This can lead to tuples mistakenly being added
to the default partition that should have been rejected.

Fix by rechecking the default partition constraint while descending the
hierarchy.

An isolation test based on the reproduction steps described by Hao Wu
(with tweaks for extra coverage) is included.

Reported by: Hao Wu <h...@vmware.com>
Author: Amit Langote <amitlangot...@gmail.com>
Author: Álvaro Herrera <alvhe...@alvh.no-ip.org>
Discussion: https://postgr.es/m/ca+hiwqfqbmcssap4sfncbuel_vfommekaq3gwuhyfa4c7j_...@mail.gmail.com
Discussion: https://postgr.es/m/dm5pr0501mb3910e97a9edfb4c775cf3d75a4...@dm5pr0501mb3910.namprd05.prod.outlook.com
---
 src/backend/executor/execPartition.c          | 127 ++++++++++++++----
 .../expected/partition-concurrent-attach.out  |  49 +++++++
 src/test/isolation/isolation_schedule         |   1 +
 .../specs/partition-concurrent-attach.spec    |  43 ++++++
 4 files changed, 195 insertions(+), 25 deletions(-)
 create mode 100644 src/test/isolation/expected/partition-concurrent-attach.out
 create mode 100644 src/test/isolation/specs/partition-concurrent-attach.spec

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 79fcbd6b06..bc2372959c 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -51,6 +51,11 @@
  *		PartitionDispatchData->indexes for details on how this array is
  *		indexed.
  *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
  * num_dispatch
  *		The current number of items stored in the 'partition_dispatch_info'
  *		array.  Also serves as the index of the next free array element for
@@ -89,6 +94,7 @@ struct PartitionTupleRouting
 {
 	Relation	partition_root;
 	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
 	int			num_dispatch;
 	int			max_dispatch;
 	ResultRelInfo **partitions;
@@ -280,9 +286,11 @@ ExecFindPartition(ModifyTableState *mtstate,
 	PartitionDispatch dispatch;
 	PartitionDesc partdesc;
 	ExprContext *ecxt = GetPerTupleExprContext(estate);
-	TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
+	TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
+	TupleTableSlot *rootslot = slot;
 	TupleTableSlot *myslot = NULL;
 	MemoryContext oldcxt;
+	ResultRelInfo *rri = NULL;
 
 	/* use per-tuple context here to avoid leaking memory */
 	oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
@@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 
 	/* start with the root partitioned table */
 	dispatch = pd[0];
-	while (true)
+	while (dispatch != NULL)
 	{
-		AttrMap    *map = dispatch->tupmap;
 		int			partidx = -1;
 
 		CHECK_FOR_INTERRUPTS();
@@ -306,17 +313,6 @@ ExecFindPartition(ModifyTableState *mtstate,
 		rel = dispatch->reldesc;
 		partdesc = dispatch->partdesc;
 
-		/*
-		 * Convert the tuple to this parent's layout, if different from the
-		 * current relation.
-		 */
-		myslot = dispatch->tupslot;
-		if (myslot != NULL)
-		{
-			Assert(map != NULL);
-			slot = execute_attr_map_slot(map, slot, myslot);
-		}
-
 		/*
 		 * Extract partition key from tuple. Expression evaluation machinery
 		 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
@@ -352,11 +348,9 @@ ExecFindPartition(ModifyTableState *mtstate,
 
 		if (partdesc->is_leaf[partidx])
 		{
-			ResultRelInfo *rri;
-
 			/*
-			 * Look to see if we've already got a ResultRelInfo for this
-			 * partition.
+			 * We've reached the leaf -- hurray, we're done.  Look to see if
+			 * we've already got a ResultRelInfo for this partition.
 			 */
 			if (likely(dispatch->indexes[partidx] >= 0))
 			{
@@ -400,14 +394,10 @@ ExecFindPartition(ModifyTableState *mtstate,
 												dispatch,
 												rootResultRelInfo, partidx);
 			}
+			Assert(rri != NULL);
 
-			/* Release the tuple in the lowest parent's dedicated slot. */
-			if (slot == myslot)
-				ExecClearTuple(myslot);
-
-			MemoryContextSwitchTo(oldcxt);
-			ecxt->ecxt_scantuple = ecxt_scantuple_old;
-			return rri;
+			/* Signal to terminate the loop */
+			dispatch = NULL;
 		}
 		else
 		{
@@ -419,6 +409,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 				/* Already built. */
 				Assert(dispatch->indexes[partidx] < proute->num_dispatch);
 
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
+
 				/*
 				 * Move down to the next partition level and search again
 				 * until we find a leaf partition that matches this tuple
@@ -440,10 +432,75 @@ ExecFindPartition(ModifyTableState *mtstate,
 															dispatch, partidx);
 				Assert(dispatch->indexes[partidx] >= 0 &&
 					   dispatch->indexes[partidx] < proute->num_dispatch);
+
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
 				dispatch = subdispatch;
 			}
+
+			/*
+			 * Convert the tuple to the new parent's layout, if different from
+			 * the previous parent.
+			 */
+			if (dispatch->tupslot)
+			{
+				AttrMap    *map = dispatch->tupmap;
+
+				/* Clear previous parent's tuple if any. */
+				if (myslot != NULL)
+					ExecClearTuple(myslot);
+
+				myslot = dispatch->tupslot;
+				slot = execute_attr_map_slot(map, slot, myslot);
+			}
+		}
+
+		/*
+		 * If this partition is the default one, we must check its partition
+		 * constraint now, which may have changed concurrently due to
+		 * partitions being added to the parent.
+		 *
+		 * (We do this here, and do not rely on ExecInsert doing it, because
+		 * we don't want to miss doing it for non-leaf partitions.)
+		 */
+		if (partidx == partdesc->boundinfo->default_index)
+		{
+			PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo;
+
+			/*
+			 * The tuple must match the partition's layout for the constraint
+			 * expression to be evaluated successfully.  If the partition is
+			 * sub-partitioned, that would already be the case due to the code
+			 * above, but for a leaf partition the tuple still matches the
+			 * parent's layout.
+			 *
+			 * Note that we have a map to convert from root to current
+			 * partition, but not from immediate parent to current partition.
+			 * So if we have to convert, do it from the root slot; if not, use
+			 * the root slot as-is.
+			 */
+			if (partrouteinfo)
+			{
+				TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap;
+
+				if (map)
+					slot = execute_attr_map_slot(map->attrMap, rootslot,
+												 partrouteinfo->pi_PartitionTupleSlot);
+				else
+					slot = rootslot;
+			}
+
+			ExecPartitionCheck(rri, slot, estate, true);
 		}
 	}
+
+	/* Release the tuple in the lowest parent's dedicated slot. */
+	if (myslot != NULL)
+		ExecClearTuple(myslot);
+	/* and restore ecxt's scantuple */
+	ecxt->ecxt_scantuple = ecxt_scantuple_saved;
+	MemoryContextSwitchTo(oldcxt);
+
+	return rri;
 }
 
 /*
@@ -1060,6 +1117,8 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->max_dispatch = 4;
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 		else
 		{
@@ -1067,10 +1126,28 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				repalloc(proute->partition_dispatch_info,
 						 sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				repalloc(proute->nonleaf_partitions,
+						 sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 	}
 	proute->partition_dispatch_info[dispatchidx] = pd;
 
+	/*
+	 * If setting up a PartitionDispatch for a sub-partitioned table, we may
+	 * also need a minimally valid ResultRelInfo for checking the partition
+	 * constraint later; set that up now.
+	 */
+	if (parent_pd)
+	{
+		ResultRelInfo *rri = makeNode(ResultRelInfo);
+
+		InitResultRelInfo(rri, rel, 1, proute->partition_root, 0);
+		proute->nonleaf_partitions[dispatchidx] = rri;
+	}
+	else
+		proute->nonleaf_partitions[dispatchidx] = NULL;
+
 	/*
 	 * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
 	 * install a downlink in the parent to allow quick descent.
diff --git a/src/test/isolation/expected/partition-concurrent-attach.out b/src/test/isolation/expected/partition-concurrent-attach.out
new file mode 100644
index 0000000000..17fac39989
--- /dev/null
+++ b/src/test/isolation/expected/partition-concurrent-attach.out
@@ -0,0 +1,49 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s1a s2b s2i s1c s2c s2s
+step s1b: begin;
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
+step s2b: begin;
+step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
+step s1c: commit;
+step s2i: <... completed>
+error in steps s1c s2i: ERROR:  new row for relation "tpart_default" violates partition constraint
+step s2c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_2        110            xxx            
+tpart_2        120            yyy            
+tpart_2        150            zzz            
+
+starting permutation: s1b s1a s2b s2i2 s1c s2c s2s
+step s1b: begin;
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
+step s2b: begin;
+step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
+step s1c: commit;
+step s2i2: <... completed>
+error in steps s1c s2i2: ERROR:  new row for relation "tpart_default" violates partition constraint
+step s2c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_2        110            xxx            
+tpart_2        120            yyy            
+tpart_2        150            zzz            
+
+starting permutation: s1b s2b s2i s1a s2c s1c s2s
+step s1b: begin;
+step s2b: begin;
+step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); <waiting ...>
+step s2c: commit;
+step s1a: <... completed>
+error in steps s2c s1a: ERROR:  updated partition constraint for default partition "tpart_default_default" would be violated by some row
+step s1c: commit;
+step s2s: select tableoid::regclass, * from tpart;
+tableoid       i              j              
+
+tpart_default_default110            xxx            
+tpart_default_default120            yyy            
+tpart_default_default150            zzz            
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 6acbb695ec..aa386ab1a2 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -81,6 +81,7 @@ test: vacuum-skip-locked
 test: predicate-hash
 test: predicate-gist
 test: predicate-gin
+test: partition-concurrent-attach
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
diff --git a/src/test/isolation/specs/partition-concurrent-attach.spec b/src/test/isolation/specs/partition-concurrent-attach.spec
new file mode 100644
index 0000000000..48c3f83e0c
--- /dev/null
+++ b/src/test/isolation/specs/partition-concurrent-attach.spec
@@ -0,0 +1,43 @@
+# Verify that default partition constraint is enforced correctly
+# in light of partitions being added concurrently to its parent
+setup {
+  drop table if exists tpart;
+  create table tpart(i int, j text) partition by range(i);
+  create table tpart_1(like tpart);
+  create table tpart_2(like tpart);
+  create table tpart_default (a int, j text, i int) partition by list (j);
+  create table tpart_default_default (a int, i int, b int, j text);
+  alter table tpart_default_default drop b;
+  alter table tpart_default attach partition tpart_default_default default;
+  alter table tpart_default drop a;
+  alter table tpart attach partition tpart_default default;
+  alter table tpart attach partition tpart_1 for values from(0) to (100);
+  insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
+}
+
+session "s1"
+step "s1b"	{	begin; }
+step "s1a"	{	alter table tpart attach partition tpart_2 for values from (100) to (200); }
+step "s1c"	{	commit; }
+
+session "s2"
+step "s2b"	{	begin; }
+step "s2i"	{	insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); }
+step "s2i2"	{	insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); }
+step "s2c"	{	commit; }
+step "s2s"	{	select tableoid::regclass, * from tpart; }
+
+teardown	{	drop table tpart; }
+
+# insert into tpart by s2 which routes to tpart_default due to not seeing
+# concurrently added tpart_2 should fail, because the partition constraint
+# of tpart_default would have changed due to tpart_2 having been added
+permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s"
+
+# similar to above, but now insert into sub-partitioned tpart_default
+permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s"
+
+# reverse: now the insert into tpart_default by s2 occurs first followed by
+# attach in s1, which should fail when it scans the leaf default partition
+# and finds the violating rows
+permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s"
-- 
2.20.1

Reply via email to