From 34e5992d122998b756b631ebf689267b03032504 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Thu, 3 Sep 2020 15:53:04 +0900
Subject: [PATCH v1] Check default partition's constraint even after tuple
 routing

A partition's constraint is not checked if a row has been inserted
into it via tuple routing, because the routing process ensures that
only the tuples satisfying the constraint make it to a given
partition.  For a default partition however, whose constraint can
change on-the-fly due to concurrent addition of partitions, this
assumption does not always hold.  So, check the constraint even
when inserting into a default partition via tuple routing.

This also adds an isolation test based on the reproduction steps
described by Hao Wu.

Reported by: Hao Wu <hawu@vmware.com>
---
 src/backend/executor/execPartition.c               | 62 ++++++++++++++++++++--
 .../expected/partition-concurrent-attach.out       | 21 ++++++++
 src/test/isolation/isolation_schedule              |  1 +
 .../specs/partition-concurrent-attach.spec         | 34 ++++++++++++
 4 files changed, 113 insertions(+), 5 deletions(-)
 create mode 100644 src/test/isolation/expected/partition-concurrent-attach.out
 create mode 100644 src/test/isolation/specs/partition-concurrent-attach.spec

diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 79fcbd6..e231b32 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -51,6 +51,13 @@
  *		PartitionDispatchData->indexes for details on how this array is
  *		indexed.
  *
+ * nonleaf_partitions
+ * 		Array of 'max_dispatch' elements containing pointers to
+ * 		ResultRelInfos of nonleaf partitions touched by tuple routing.  These
+ * 		are currently only used for checking the partition's constraint if
+ * 		the nonleaf partition happens to be a default partition of its
+ * 		parent.
+ *
  * num_dispatch
  *		The current number of items stored in the 'partition_dispatch_info'
  *		array.  Also serves as the index of the next free array element for
@@ -89,6 +96,7 @@ struct PartitionTupleRouting
 {
 	Relation	partition_root;
 	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
 	int			num_dispatch;
 	int			max_dispatch;
 	ResultRelInfo **partitions;
@@ -283,6 +291,7 @@ ExecFindPartition(ModifyTableState *mtstate,
 	TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
 	TupleTableSlot *myslot = NULL;
 	MemoryContext oldcxt;
+	ResultRelInfo *rri = NULL;
 
 	/* use per-tuple context here to avoid leaking memory */
 	oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
@@ -352,8 +361,6 @@ ExecFindPartition(ModifyTableState *mtstate,
 
 		if (partdesc->is_leaf[partidx])
 		{
-			ResultRelInfo *rri;
-
 			/*
 			 * Look to see if we've already got a ResultRelInfo for this
 			 * partition.
@@ -405,9 +412,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 			if (slot == myslot)
 				ExecClearTuple(myslot);
 
-			MemoryContextSwitchTo(oldcxt);
-			ecxt->ecxt_scantuple = ecxt_scantuple_old;
-			return rri;
+			/* We've reached the leaf. */
+			dispatch = NULL;
 		}
 		else
 		{
@@ -419,6 +425,8 @@ ExecFindPartition(ModifyTableState *mtstate,
 				/* Already built. */
 				Assert(dispatch->indexes[partidx] < proute->num_dispatch);
 
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
+
 				/*
 				 * Move down to the next partition level and search again
 				 * until we find a leaf partition that matches this tuple
@@ -440,10 +448,37 @@ ExecFindPartition(ModifyTableState *mtstate,
 															dispatch, partidx);
 				Assert(dispatch->indexes[partidx] >= 0 &&
 					   dispatch->indexes[partidx] < proute->num_dispatch);
+
+				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
 				dispatch = subdispatch;
 			}
 		}
+
+		/*
+		 * If this partition is the default one, we must check its partition
+		 * constraint, because it may have changed due to partitions being
+		 * added to the parent concurrently.  We do the check here instead of
+		 * in ExecInsert(), because we would not want to miss checking the
+		 * constraint of any nonleaf partitions as they never make it to
+		 * ExecInsert().
+		 */
+		if (partidx == partdesc->boundinfo->default_index)
+		{
+			Assert(rri != NULL);
+			ExecPartitionCheck(rri, slot, estate, true);
+		}
+
+		/* Break out and return if we've found the leaf partition. */
+		if (dispatch == NULL)
+			break;
 	}
+
+	MemoryContextSwitchTo(oldcxt);
+	ecxt->ecxt_scantuple = ecxt_scantuple_old;
+
+	Assert(rri != NULL);
+
+	return rri;
 }
 
 /*
@@ -992,6 +1027,7 @@ ExecInitPartitionDispatchInfo(EState *estate,
 	Relation	rel;
 	PartitionDesc partdesc;
 	PartitionDispatch pd;
+	ResultRelInfo *rri = NULL;
 	int			dispatchidx;
 	MemoryContext oldcxt;
 
@@ -1060,6 +1096,8 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->max_dispatch = 4;
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 		else
 		{
@@ -1067,6 +1105,9 @@ ExecInitPartitionDispatchInfo(EState *estate,
 			proute->partition_dispatch_info = (PartitionDispatch *)
 				repalloc(proute->partition_dispatch_info,
 						 sizeof(PartitionDispatch) * proute->max_dispatch);
+			proute->nonleaf_partitions = (ResultRelInfo **)
+				repalloc(proute->nonleaf_partitions,
+						 sizeof(ResultRelInfo *) * proute->max_dispatch);
 		}
 	}
 	proute->partition_dispatch_info[dispatchidx] = pd;
@@ -1081,6 +1122,17 @@ ExecInitPartitionDispatchInfo(EState *estate,
 		parent_pd->indexes[partidx] = dispatchidx;
 	}
 
+	/*
+	 * Also create a minimally valid ResultRelInfo so that the partition's
+	 * constraint can be checked.
+	 */
+	if (rel->rd_rel->relispartition)
+	{
+		rri = makeNode(ResultRelInfo);
+		InitResultRelInfo(rri, rel, 1, proute->partition_root, 0);
+	}
+	proute->nonleaf_partitions[dispatchidx] = rri;
+
 	MemoryContextSwitchTo(oldcxt);
 
 	return pd;
diff --git a/src/test/isolation/expected/partition-concurrent-attach.out b/src/test/isolation/expected/partition-concurrent-attach.out
new file mode 100644
index 0000000..553bb44
--- /dev/null
+++ b/src/test/isolation/expected/partition-concurrent-attach.out
@@ -0,0 +1,21 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s1a s2b s2i s1c s2c
+step s1b: begin;
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
+step s2b: begin;
+step s2i: insert into tpart values (110,110),(120,120),(150,150); <waiting ...>
+step s1c: commit;
+step s2i: <... completed>
+error in steps s1c s2i: ERROR:  new row for relation "tpart_default" violates partition constraint
+step s2c: commit;
+
+starting permutation: s1b s2b s2i s1a s2c s1c
+step s1b: begin;
+step s2b: begin;
+step s2i: insert into tpart values (110,110),(120,120),(150,150);
+step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); <waiting ...>
+step s2c: commit;
+step s1a: <... completed>
+error in steps s2c s1a: ERROR:  updated partition constraint for default partition "tpart_default" would be violated by some row
+step s1c: commit;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 218c87b..56c08fd 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -88,3 +88,4 @@ test: plpgsql-toast
 test: truncate-conflict
 test: serializable-parallel
 test: serializable-parallel-2
+test: partition-concurrent-attach
diff --git a/src/test/isolation/specs/partition-concurrent-attach.spec b/src/test/isolation/specs/partition-concurrent-attach.spec
new file mode 100644
index 0000000..5290e8a
--- /dev/null
+++ b/src/test/isolation/specs/partition-concurrent-attach.spec
@@ -0,0 +1,34 @@
+# Verify that default partition constraint is enforced correctly
+# in light of partitions being added concurrently to its parent
+setup {
+  drop table if exists tpart;
+  create table tpart(i int, j int) partition by range(i);
+  create table tpart_1(like tpart);
+  create table tpart_2(like tpart);
+  create table tpart_default(like tpart);
+  alter table tpart attach partition tpart_1 for values from(0) to (100);
+  alter table tpart attach partition tpart_default default;
+  insert into tpart_2 values(110,110),(120,120),(150,150);
+}
+
+session "s1"
+step "s1b"	{	begin; }
+step "s1a"	{	alter table tpart attach partition tpart_2 for values from (100) to (200); }
+step "s1c"	{	commit; }
+
+session "s2"
+step "s2b"	{	begin; }
+step "s2i"	{	insert into tpart values (110,110),(120,120),(150,150); }
+step "s2c"	{	commit; }
+
+teardown	{	drop table tpart; }
+
+# insert into tpart by s2 which routes to tpart_default due to not seeing
+# concurrently added tpart_2 should fail, because the partition constraint
+# of tpart_default would have changed due to tpart_2 having been added
+permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c"
+
+# reverse: now the insert into tpart_default by s2 occurs first, followed by
+# the attach in s1, which should fail when it scans the default partition to
+# find the violating rows
+permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c"
-- 
1.8.3.1

