Hi.

I have a patch that rearranges the code around partition tuple-routing,
such that allocation of per-partition objects (ResultRelInfo,
TupleConversionMap, etc.) is delayed until a given partition is actually
inserted into (i.e., a tuple is routed to it).  I can see a good win for
non-bulk inserts with the patch, and the patch is implemented such that it
doesn't affect the bulk-insert case much.

Performance numbers:

* Uses the following hash-partitioned table:

create table t1 (a int, b int) partition by hash (a);
create table t1_x partition of t1 for values with (modulus M, remainder R)
...


* Non-bulk insert uses the following code (insert 100,000 rows one-by-one):

do $$
begin
  for i in 1..100000 loop
    insert into t1 values (i, i+1);
  end loop;
end; $$;

* Times in milliseconds:

#parts           HEAD        Patched

     8       6216.300       4977.670
    16       9061.388       6360.093
    32      14081.656       8752.405
    64      24887.110      13919.384
   128      45926.251      24582.411
   256      88088.084      45490.894

As you can see, non-bulk inserts can be almost 2x faster with the patch,
although the time taken still increases as the number of partitions
increases, because we still lock *all* partitions at the beginning.

* Bulk-inserting 100,000 rows using COPY:

copy t1 from '/tmp/t1.csv' csv;

* Times in milliseconds:

#parts           HEAD        Patched

     8        458.301        450.875
    16        409.271        510.723
    32        500.960        612.003
    64        430.687        795.046
   128        449.314        565.786
   256        493.171        490.187

Not much harm here, although the numbers are a bit noisy.

The patch is divided into 4 parts, the first 3 of which are refactoring patches.

I know this patch will conflict severely with [1] and [2], so it's fine if
we consider applying these later.  I will add this to the next CF.

Thanks,
Amit

[1] https://commitfest.postgresql.org/16/1023/

[2] https://commitfest.postgresql.org/16/1184/
From a87be8a84d467d65cc0b6cf02655fc3b2b9a458f Mon Sep 17 00:00:00 2001
From: amit <amitlangot...@gmail.com>
Date: Tue, 19 Dec 2017 10:43:45 +0900
Subject: [PATCH 1/4] Teach CopyFrom to use ModifyTableState for tuple-routing

This removes all fields of CopyStateData that were meant for
tuple routing and instead uses ModifyTableState that has all those
fields, including transition_tupconv_maps.  In COPY's case,
transition_tupconv_maps is only required if tuple routing is being
used, so it's safe.
---
 src/backend/commands/copy.c | 79 ++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 254be28ae4..c82103e1c5 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -166,14 +166,7 @@ typedef struct CopyStateData
        bool            volatile_defexprs;      /* is any of defexprs volatile? 
*/
        List       *range_table;
 
-       PartitionDispatch *partition_dispatch_info;
-       int                     num_dispatch;   /* Number of entries in the 
above array */
-       int                     num_partitions; /* Number of members in the 
following arrays */
-       ResultRelInfo **partitions; /* Per partition result relation pointers */
-       TupleConversionMap **partition_tupconv_maps;
-       TupleTableSlot *partition_tuple_slot;
        TransitionCaptureState *transition_capture;
-       TupleConversionMap **transition_tupconv_maps;
 
        /*
         * These variables are used to reduce overhead in textual COPY FROM.
@@ -2289,6 +2282,7 @@ CopyFrom(CopyState cstate)
        ResultRelInfo *resultRelInfo;
        ResultRelInfo *saved_resultRelInfo = NULL;
        EState     *estate = CreateExecutorState(); /* for ExecConstraints() */
+       ModifyTableState *mtstate = makeNode(ModifyTableState);
        ExprContext *econtext;
        TupleTableSlot *myslot;
        MemoryContext oldcontext = CurrentMemoryContext;
@@ -2478,22 +2472,28 @@ CopyFrom(CopyState cstate)
                TupleTableSlot *partition_tuple_slot;
                int                     num_parted,
                                        num_partitions;
-
-               ExecSetupPartitionTupleRouting(NULL,
+               ModifyTable *node = makeNode(ModifyTable);
+
+               /* Just need make this field appear valid. */
+               node->nominalRelation = 1;
+               mtstate->ps.plan = (Plan *) node;
+               mtstate->ps.state = estate;
+               mtstate->resultRelInfo = resultRelInfo;
+               ExecSetupPartitionTupleRouting(mtstate,
                                                                           
cstate->rel,
-                                                                          1,
+                                                                          
node->nominalRelation,
                                                                           
estate,
                                                                           
&partition_dispatch_info,
                                                                           
&partitions,
                                                                           
&partition_tupconv_maps,
                                                                           
&partition_tuple_slot,
                                                                           
&num_parted, &num_partitions);
-               cstate->partition_dispatch_info = partition_dispatch_info;
-               cstate->num_dispatch = num_parted;
-               cstate->partitions = partitions;
-               cstate->num_partitions = num_partitions;
-               cstate->partition_tupconv_maps = partition_tupconv_maps;
-               cstate->partition_tuple_slot = partition_tuple_slot;
+               mtstate->mt_partition_dispatch_info = partition_dispatch_info;
+               mtstate->mt_num_dispatch = num_parted;
+               mtstate->mt_partitions = partitions;
+               mtstate->mt_num_partitions = num_partitions;
+               mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+               mtstate->mt_partition_tuple_slot = partition_tuple_slot;
 
                /*
                 * If we are capturing transition tuples, they may need to be
@@ -2505,12 +2505,13 @@ CopyFrom(CopyState cstate)
                {
                        int                     i;
 
-                       cstate->transition_tupconv_maps = (TupleConversionMap 
**)
-                               palloc0(sizeof(TupleConversionMap *) * 
cstate->num_partitions);
-                       for (i = 0; i < cstate->num_partitions; ++i)
+                       mtstate->mt_transition_tupconv_maps = 
(TupleConversionMap **)
+                                                                               
palloc0(sizeof(TupleConversionMap *) *
+                                                                               
                mtstate->mt_num_partitions);
+                       for (i = 0; i < mtstate->mt_num_partitions; ++i)
                        {
-                               cstate->transition_tupconv_maps[i] =
-                                       
convert_tuples_by_name(RelationGetDescr(cstate->partitions[i]->ri_RelationDesc),
+                               mtstate->mt_transition_tupconv_maps[i] =
+                                       
convert_tuples_by_name(RelationGetDescr(mtstate->mt_partitions[i]->ri_RelationDesc),
                                                                                
   RelationGetDescr(cstate->rel),
                                                                                
   gettext_noop("could not convert row type"));
                        }
@@ -2530,7 +2531,7 @@ CopyFrom(CopyState cstate)
        if ((resultRelInfo->ri_TrigDesc != NULL &&
                 (resultRelInfo->ri_TrigDesc->trig_insert_before_row ||
                  resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) ||
-               cstate->partition_dispatch_info != NULL ||
+               mtstate->mt_partition_dispatch_info != NULL ||
                cstate->volatile_defexprs)
        {
                useHeapMultiInsert = false;
@@ -2605,7 +2606,7 @@ CopyFrom(CopyState cstate)
                ExecStoreTuple(tuple, slot, InvalidBuffer, false);
 
                /* Determine the partition to heap_insert the tuple into */
-               if (cstate->partition_dispatch_info)
+               if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
                {
                        int                     leaf_part_index;
                        TupleConversionMap *map;
@@ -2619,11 +2620,11 @@ CopyFrom(CopyState cstate)
                         * partition, respectively.
                         */
                        leaf_part_index = ExecFindPartition(resultRelInfo,
-                                                                               
                cstate->partition_dispatch_info,
+                                                                               
mtstate->mt_partition_dispatch_info,
                                                                                
                slot,
                                                                                
                estate);
                        Assert(leaf_part_index >= 0 &&
-                                  leaf_part_index < cstate->num_partitions);
+                                  leaf_part_index < 
mtstate->mt_num_partitions);
 
                        /*
                         * If this tuple is mapped to a partition that is not 
same as the
@@ -2641,7 +2642,8 @@ CopyFrom(CopyState cstate)
                         * to the selected partition.
                         */
                        saved_resultRelInfo = resultRelInfo;
-                       resultRelInfo = cstate->partitions[leaf_part_index];
+                       resultRelInfo = mtstate->mt_partitions[leaf_part_index];
+                       Assert(resultRelInfo != NULL);
 
                        /* We do not yet have a way to insert into a foreign 
partition */
                        if (resultRelInfo->ri_FdwRoutine)
@@ -2671,7 +2673,7 @@ CopyFrom(CopyState cstate)
                                         */
                                        
cstate->transition_capture->tcs_original_insert_tuple = NULL;
                                        cstate->transition_capture->tcs_map =
-                                               
cstate->transition_tupconv_maps[leaf_part_index];
+                                               
mtstate->mt_transition_tupconv_maps[leaf_part_index];
                                }
                                else
                                {
@@ -2688,7 +2690,7 @@ CopyFrom(CopyState cstate)
                         * We might need to convert from the parent rowtype to 
the
                         * partition rowtype.
                         */
-                       map = cstate->partition_tupconv_maps[leaf_part_index];
+                       map = 
mtstate->mt_partition_tupconv_maps[leaf_part_index];
                        if (map)
                        {
                                Relation        partrel = 
resultRelInfo->ri_RelationDesc;
@@ -2700,7 +2702,7 @@ CopyFrom(CopyState cstate)
                                 * point on.  Use a dedicated slot from this 
point on until
                                 * we're finished dealing with the partition.
                                 */
-                               slot = cstate->partition_tuple_slot;
+                               slot = mtstate->mt_partition_tuple_slot;
                                Assert(slot != NULL);
                                ExecSetSlotDescriptor(slot, 
RelationGetDescr(partrel));
                                ExecStoreTuple(tuple, slot, InvalidBuffer, 
true);
@@ -2852,7 +2854,7 @@ CopyFrom(CopyState cstate)
        ExecCloseIndices(resultRelInfo);
 
        /* Close all the partitioned tables, leaf partitions, and their indices 
*/
-       if (cstate->partition_dispatch_info)
+       if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
        {
                int                     i;
 
@@ -2862,23 +2864,26 @@ CopyFrom(CopyState cstate)
                 * the main target table of COPY that will be closed eventually 
by
                 * DoCopy().  Also, tupslot is NULL for the root partitioned 
table.
                 */
-               for (i = 1; i < cstate->num_dispatch; i++)
+               for (i = 1; i < mtstate->mt_num_dispatch; i++)
                {
-                       PartitionDispatch pd = 
cstate->partition_dispatch_info[i];
+                       PartitionDispatch pd = 
mtstate->mt_partition_dispatch_info[i];
 
                        heap_close(pd->reldesc, NoLock);
                        ExecDropSingleTupleTableSlot(pd->tupslot);
                }
-               for (i = 0; i < cstate->num_partitions; i++)
+               for (i = 0; i < mtstate->mt_num_partitions; i++)
                {
-                       ResultRelInfo *resultRelInfo = cstate->partitions[i];
+                       ResultRelInfo *resultRelInfo = 
mtstate->mt_partitions[i];
 
-                       ExecCloseIndices(resultRelInfo);
-                       heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+                       if (resultRelInfo)
+                       {
+                               ExecCloseIndices(resultRelInfo);
+                               heap_close(resultRelInfo->ri_RelationDesc, 
NoLock);
+                       }
                }
 
                /* Release the standalone partition tuple descriptor */
-               ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot);
+               ExecDropSingleTupleTableSlot(mtstate->mt_partition_tuple_slot);
        }
 
        /* Close any trigger target relations */
-- 
2.11.0

From 3e251d46de5105581acf620773568bb9cdecdf0b Mon Sep 17 00:00:00 2001
From: amit <amitlangot...@gmail.com>
Date: Tue, 19 Dec 2017 13:56:25 +0900
Subject: [PATCH 2/4] ExecFindPartition refactoring

---
 src/backend/commands/copy.c            |  5 +----
 src/backend/executor/execPartition.c   | 14 ++++++--------
 src/backend/executor/nodeModifyTable.c |  5 +----
 src/include/executor/execPartition.h   |  5 +----
 4 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index c82103e1c5..280d449dec 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2619,10 +2619,7 @@ CopyFrom(CopyState cstate)
                         * will get us the ResultRelInfo and TupleConversionMap 
for the
                         * partition, respectively.
                         */
-                       leaf_part_index = ExecFindPartition(resultRelInfo,
-                                                                               
mtstate->mt_partition_dispatch_info,
-                                                                               
                slot,
-                                                                               
                estate);
+                       leaf_part_index = ExecFindPartition(mtstate, slot);
                        Assert(leaf_part_index >= 0 &&
                                   leaf_part_index < 
mtstate->mt_num_partitions);
 
diff --git a/src/backend/executor/execPartition.c 
b/src/backend/executor/execPartition.c
index d545af2b67..a40c174230 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -155,11 +155,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
 }
 
 /*
- * ExecFindPartition -- Find a leaf partition in the partition tree rooted
- * at parent, for the heap tuple contained in *slot
- *
- * estate must be non-NULL; we'll need it to compute any expressions in the
- * partition key(s)
+ * ExecFindPartition -- Find a leaf partition for tuple contained in slot
  *
  * If no leaf partition is found, this routine errors out with the appropriate
  * error message, else it returns the leaf partition sequence number
@@ -167,14 +163,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
  * the partition tree.
  */
 int
-ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
-                                 TupleTableSlot *slot, EState *estate)
+ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot)
 {
+       EState     *estate = mtstate->ps.state;
        int                     result;
        Datum           values[PARTITION_MAX_KEYS];
        bool            isnull[PARTITION_MAX_KEYS];
        Relation        rel;
-       PartitionDispatch parent;
+       PartitionDispatch  *pd = mtstate->mt_partition_dispatch_info,
+                                               parent;
+       ResultRelInfo *resultRelInfo = mtstate->resultRelInfo;
        ExprContext *ecxt = GetPerTupleExprContext(estate);
        TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
 
diff --git a/src/backend/executor/nodeModifyTable.c 
b/src/backend/executor/nodeModifyTable.c
index afb83ed3ae..f836dd3703 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -292,10 +292,7 @@ ExecInsert(ModifyTableState *mtstate,
                 * the ResultRelInfo and TupleConversionMap for the partition,
                 * respectively.
                 */
-               leaf_part_index = ExecFindPartition(resultRelInfo,
-                                                                               
        mtstate->mt_partition_dispatch_info,
-                                                                               
        slot,
-                                                                               
        estate);
+               leaf_part_index = ExecFindPartition(mtstate, slot);
                Assert(leaf_part_index >= 0 &&
                           leaf_part_index < mtstate->mt_num_partitions);
 
diff --git a/src/include/executor/execPartition.h 
b/src/include/executor/execPartition.h
index 86a199d169..19e3b9d233 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -58,9 +58,6 @@ extern void ExecSetupPartitionTupleRouting(ModifyTableState 
*mtstate,
                                                           TupleConversionMap 
***tup_conv_maps,
                                                           TupleTableSlot 
**partition_tuple_slot,
                                                           int *num_parted, int 
*num_partitions);
-extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
-                                 PartitionDispatch *pd,
-                                 TupleTableSlot *slot,
-                                 EState *estate);
+extern int ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot);
 
 #endif                                                 /* EXECPARTITION_H */
-- 
2.11.0

From 6ea3100c3df46ee131ea3d7590eaba378536c320 Mon Sep 17 00:00:00 2001
From: amit <amitlangot...@gmail.com>
Date: Tue, 19 Dec 2017 16:20:09 +0900
Subject: [PATCH 3/4] ExecSetupPartitionTupleRouting refactoring

---
 src/backend/commands/copy.c            | 22 +----------
 src/backend/executor/execPartition.c   | 69 +++++++++++++++-------------------
 src/backend/executor/nodeModifyTable.c | 25 +-----------
 src/include/executor/execPartition.h   |  9 +----
 4 files changed, 33 insertions(+), 92 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 280d449dec..e7fe020fa7 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2466,12 +2466,6 @@ CopyFrom(CopyState cstate)
         */
        if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
        {
-               PartitionDispatch *partition_dispatch_info;
-               ResultRelInfo **partitions;
-               TupleConversionMap **partition_tupconv_maps;
-               TupleTableSlot *partition_tuple_slot;
-               int                     num_parted,
-                                       num_partitions;
                ModifyTable *node = makeNode(ModifyTable);
 
                /* Just need make this field appear valid. */
@@ -2479,21 +2473,7 @@ CopyFrom(CopyState cstate)
                mtstate->ps.plan = (Plan *) node;
                mtstate->ps.state = estate;
                mtstate->resultRelInfo = resultRelInfo;
-               ExecSetupPartitionTupleRouting(mtstate,
-                                                                          
cstate->rel,
-                                                                          
node->nominalRelation,
-                                                                          
estate,
-                                                                          
&partition_dispatch_info,
-                                                                          
&partitions,
-                                                                          
&partition_tupconv_maps,
-                                                                          
&partition_tuple_slot,
-                                                                          
&num_parted, &num_partitions);
-               mtstate->mt_partition_dispatch_info = partition_dispatch_info;
-               mtstate->mt_num_dispatch = num_parted;
-               mtstate->mt_partitions = partitions;
-               mtstate->mt_num_partitions = num_partitions;
-               mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
-               mtstate->mt_partition_tuple_slot = partition_tuple_slot;
+               ExecSetupPartitionTupleRouting(mtstate, cstate->rel);
 
                /*
                 * If we are capturing transition tuples, they may need to be
diff --git a/src/backend/executor/execPartition.c 
b/src/backend/executor/execPartition.c
index a40c174230..a495b165bd 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -41,42 +41,19 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation 
rel,
  * ExecSetupPartitionTupleRouting - set up information needed during
  * tuple routing for partitioned tables
  *
- * Output arguments:
- * 'pd' receives an array of PartitionDispatch objects with one entry for
- *             every partitioned table in the partition tree
- * 'partitions' receives an array of ResultRelInfo* objects with one entry for
- *             every leaf partition in the partition tree
- * 'tup_conv_maps' receives an array of TupleConversionMap objects with one
- *             entry for every leaf partition (required to convert input tuple 
based
- *             on the root table's rowtype to a leaf partition's rowtype after 
tuple
- *             routing is done)
- * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used
- *             to manipulate any given leaf partition's rowtype after that 
partition
- *             is chosen by tuple-routing.
- * 'num_parted' receives the number of partitioned tables in the partition
- *             tree (= the number of entries in the 'pd' output array)
- * 'num_partitions' receives the number of leaf partitions in the partition
- *             tree (= the number of entries in the 'partitions' and 
'tup_conv_maps'
- *             output arrays
- *
  * Note that all the relations in the partition tree are locked using the
  * RowExclusiveLock mode upon return from this function.
  */
 void
-ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
-                                                          Relation rel,
-                                                          Index resultRTindex,
-                                                          EState *estate,
-                                                          PartitionDispatch 
**pd,
-                                                          ResultRelInfo 
***partitions,
-                                                          TupleConversionMap 
***tup_conv_maps,
-                                                          TupleTableSlot 
**partition_tuple_slot,
-                                                          int *num_parted, int 
*num_partitions)
+ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 {
        TupleDesc       tupDesc = RelationGetDescr(rel);
        List       *leaf_parts;
        ListCell   *cell;
        int                     i;
+       EState     *estate = mtstate->ps.state;
+       ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+       Index           resultRTindex = node->nominalRelation;
        ResultRelInfo *leaf_part_rri;
 
        /*
@@ -84,23 +61,35 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
         * partitions.
         */
        (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, 
NULL);
-       *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts);
-       *num_partitions = list_length(leaf_parts);
-       *partitions = (ResultRelInfo **) palloc(*num_partitions *
-                                                                               
        sizeof(ResultRelInfo *));
-       *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions *
-                                                                               
                         sizeof(TupleConversionMap *));
+       mtstate->mt_partition_dispatch_info =
+                               RelationGetPartitionDispatchInfo(rel,
+                                                                               
                 &mtstate->mt_num_dispatch,
+                                                                               
                 &leaf_parts);
+       mtstate->mt_num_partitions = list_length(leaf_parts);
 
        /*
+        * Allocate an array of ResultRelInfo pointers, but actual
+        * ResultRelInfo's will be allocated if and when needed.  See
+        * ExecFindPartition where it's done.
+        */
+       mtstate->mt_partitions = (ResultRelInfo **)
+                                                                               
 palloc0(sizeof(ResultRelInfo *) *
+                                                                               
                 mtstate->mt_num_partitions);
+       /* Ditto. */
+       mtstate->mt_partition_tupconv_maps =
+                                                       (TupleConversionMap **)
+                                                                               
palloc0(sizeof(TupleConversionMap *) *
+                                                                               
                mtstate->mt_num_partitions);
+       /*
         * Initialize an empty slot that will be used to manipulate tuples of 
any
         * given partition's rowtype.  It is attached to the caller-specified 
node
         * (such as ModifyTableState) and released when the node finishes
         * processing.
         */
-       *partition_tuple_slot = MakeTupleTableSlot();
+       mtstate->mt_partition_tuple_slot = MakeTupleTableSlot();
 
-       leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions *
-                                                                               
          sizeof(ResultRelInfo));
+       leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo) *
+                                                                               
          mtstate->mt_num_partitions);
        i = 0;
        foreach(cell, leaf_parts)
        {
@@ -119,8 +108,10 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
                 * Save a tuple conversion map to convert a tuple routed to this
                 * partition from the parent's type to the partition's.
                 */
-               (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, 
part_tupdesc,
-                                                                               
                         gettext_noop("could not convert row type"));
+               mtstate->mt_partition_tupconv_maps[i] =
+                                                               
convert_tuples_by_name(tupDesc,
+                                                                               
                           part_tupdesc,
+                                                                       
gettext_noop("could not convert row type"));
 
                InitResultRelInfo(leaf_part_rri,
                                                  partrel,
@@ -149,7 +140,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
                estate->es_leaf_result_relations =
                        lappend(estate->es_leaf_result_relations, 
leaf_part_rri);
 
-               (*partitions)[i] = leaf_part_rri++;
+               mtstate->mt_partitions[i] = leaf_part_rri++;
                i++;
        }
 }
diff --git a/src/backend/executor/nodeModifyTable.c 
b/src/backend/executor/nodeModifyTable.c
index f836dd3703..6a3b171587 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1942,30 +1942,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, 
int eflags)
        /* Build state for INSERT tuple routing */
        if (operation == CMD_INSERT &&
                rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-       {
-               PartitionDispatch *partition_dispatch_info;
-               ResultRelInfo **partitions;
-               TupleConversionMap **partition_tupconv_maps;
-               TupleTableSlot *partition_tuple_slot;
-               int                     num_parted,
-                                       num_partitions;
-
-               ExecSetupPartitionTupleRouting(mtstate,
-                                                                          rel,
-                                                                          
node->nominalRelation,
-                                                                          
estate,
-                                                                          
&partition_dispatch_info,
-                                                                          
&partitions,
-                                                                          
&partition_tupconv_maps,
-                                                                          
&partition_tuple_slot,
-                                                                          
&num_parted, &num_partitions);
-               mtstate->mt_partition_dispatch_info = partition_dispatch_info;
-               mtstate->mt_num_dispatch = num_parted;
-               mtstate->mt_partitions = partitions;
-               mtstate->mt_num_partitions = num_partitions;
-               mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
-               mtstate->mt_partition_tuple_slot = partition_tuple_slot;
-       }
+               ExecSetupPartitionTupleRouting(mtstate, rel);
 
        /*
         * Build state for collecting transition tuples.  This requires having a
diff --git a/src/include/executor/execPartition.h 
b/src/include/executor/execPartition.h
index 19e3b9d233..c3ddf879b9 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -50,14 +50,7 @@ typedef struct PartitionDispatchData
 typedef struct PartitionDispatchData *PartitionDispatch;
 
 extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
-                                                          Relation rel,
-                                                          Index resultRTindex,
-                                                          EState *estate,
-                                                          PartitionDispatch 
**pd,
-                                                          ResultRelInfo 
***partitions,
-                                                          TupleConversionMap 
***tup_conv_maps,
-                                                          TupleTableSlot 
**partition_tuple_slot,
-                                                          int *num_parted, int 
*num_partitions);
+                                                          Relation rel);
 extern int ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot);
 
 #endif                                                 /* EXECPARTITION_H */
-- 
2.11.0

From ed8469d38a0747fe1b3d1fb3bb8c45b4cb2a2b45 Mon Sep 17 00:00:00 2001
From: amit <amitlangot...@gmail.com>
Date: Wed, 1 Nov 2017 10:31:21 +0900
Subject: [PATCH 4/4] During tuple-routing, initialize per-partition objects
 lazily

Those objects include ResultRelInfo, tuple conversion map,
WITH CHECK OPTION quals and RETURNING projections.

This means we don't allocate these objects for partitions that are
never inserted into.
---
 src/backend/commands/copy.c            |  15 +--
 src/backend/executor/execPartition.c   | 225 ++++++++++++++++++++++++---------
 src/backend/executor/nodeModifyTable.c | 108 ++--------------
 src/include/nodes/execnodes.h          |   1 +
 4 files changed, 180 insertions(+), 169 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index e7fe020fa7..3674aea9b3 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2479,23 +2479,14 @@ CopyFrom(CopyState cstate)
                 * If we are capturing transition tuples, they may need to be
                 * converted from partition format back to partitioned table 
format
                 * (this is only ever necessary if a BEFORE trigger modifies the
-                * tuple).
+                * tuple).  Note that we don't allocate the actual maps here; 
they'll
+                * be allocated by ExecInitPartitionResultRelInfo() if and when
+                * needed.
                 */
                if (cstate->transition_capture != NULL)
-               {
-                       int                     i;
-
                        mtstate->mt_transition_tupconv_maps = 
(TupleConversionMap **)
                                                                                
palloc0(sizeof(TupleConversionMap *) *
                                                                                
                mtstate->mt_num_partitions);
-                       for (i = 0; i < mtstate->mt_num_partitions; ++i)
-                       {
-                               mtstate->mt_transition_tupconv_maps[i] =
-                                       
convert_tuples_by_name(RelationGetDescr(mtstate->mt_partitions[i]->ri_RelationDesc),
-                                                                               
   RelationGetDescr(cstate->rel),
-                                                                               
   gettext_noop("could not convert row type"));
-                       }
-               }
        }
 
        /*
diff --git a/src/backend/executor/execPartition.c 
b/src/backend/executor/execPartition.c
index a495b165bd..3e2226e5f8 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -23,6 +23,8 @@
 #include "utils/rls.h"
 #include "utils/ruleutils.h"
 
+static void ExecInitPartitionResultRelInfo(ModifyTableState *mtstate,
+                                       int partidx);
 static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
                                                                 int 
*num_parted, List **leaf_part_oids);
 static void get_partition_dispatch_recurse(Relation rel, Relation parent,
@@ -47,14 +49,9 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation 
rel,
 void
 ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 {
-       TupleDesc       tupDesc = RelationGetDescr(rel);
        List       *leaf_parts;
        ListCell   *cell;
        int                     i;
-       EState     *estate = mtstate->ps.state;
-       ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
-       Index           resultRTindex = node->nominalRelation;
-       ResultRelInfo *leaf_part_rri;
 
        /*
         * Get the information about the partition tree after locking all the
@@ -66,6 +63,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, 
Relation rel)
                                                                                
                 &mtstate->mt_num_dispatch,
                                                                                
                 &leaf_parts);
        mtstate->mt_num_partitions = list_length(leaf_parts);
+       mtstate->mt_partition_oids = (Oid *) palloc0(sizeof(Oid) *
+                                                                               
                 mtstate->mt_num_partitions);
+       i = 0;
+       foreach (cell, leaf_parts)
+               mtstate->mt_partition_oids[i++] = lfirst_oid(cell);
 
        /*
         * Allocate an array of ResultRelInfo pointers, but actual
@@ -87,62 +89,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, 
Relation rel)
         * processing.
         */
        mtstate->mt_partition_tuple_slot = MakeTupleTableSlot();
-
-       leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo) *
-                                                                               
          mtstate->mt_num_partitions);
-       i = 0;
-       foreach(cell, leaf_parts)
-       {
-               Relation        partrel;
-               TupleDesc       part_tupdesc;
-
-               /*
-                * We locked all the partitions above including the leaf 
partitions.
-                * Note that each of the relations in *partitions are eventually
-                * closed by the caller.
-                */
-               partrel = heap_open(lfirst_oid(cell), NoLock);
-               part_tupdesc = RelationGetDescr(partrel);
-
-               /*
-                * Save a tuple conversion map to convert a tuple routed to this
-                * partition from the parent's type to the partition's.
-                */
-               mtstate->mt_partition_tupconv_maps[i] =
-                                                               
convert_tuples_by_name(tupDesc,
-                                                                               
                           part_tupdesc,
-                                                                       
gettext_noop("could not convert row type"));
-
-               InitResultRelInfo(leaf_part_rri,
-                                                 partrel,
-                                                 resultRTindex,
-                                                 rel,
-                                                 estate->es_instrument);
-
-               /*
-                * Verify result relation is a valid target for INSERT.
-                */
-               CheckValidResultRel(leaf_part_rri, CMD_INSERT);
-
-               /*
-                * Open partition indices.  The user may have asked to check for
-                * conflicts within this leaf partition and do "nothing" 
instead of
-                * throwing an error.  Be prepared in that case by initializing 
the
-                * index information needed by ExecInsert() to perform 
speculative
-                * insertions.
-                */
-               if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex &&
-                       leaf_part_rri->ri_IndexRelationDescs == NULL)
-                       ExecOpenIndices(leaf_part_rri,
-                                                       mtstate != NULL &&
-                                                       mtstate->mt_onconflict 
!= ONCONFLICT_NONE);
-
-               estate->es_leaf_result_relations =
-                       lappend(estate->es_leaf_result_relations, 
leaf_part_rri);
-
-               mtstate->mt_partitions[i] = leaf_part_rri++;
-               i++;
-       }
 }
 
 /*
@@ -257,11 +203,168 @@ ExecFindPartition(ModifyTableState *mtstate, 
TupleTableSlot *slot)
                                 val_desc ? errdetail("Partition key of the 
failing row contains %s.", val_desc) : 0));
        }
 
+       /* Initialize the partition result rel, if not done already. */
+       ExecInitPartitionResultRelInfo(mtstate, result);
        ecxt->ecxt_scantuple = ecxt_scantuple_old;
        return result;
 }
 
 /*
+ * ExecInitPartitionResultRelInfo
+ *             Initialize ResultRelInfo for a partition if not done already
+ */
+static void
+ExecInitPartitionResultRelInfo(ModifyTableState *mtstate, int partidx)
+{
+       EState     *estate = mtstate->ps.state;
+       Relation        rootrel = mtstate->resultRelInfo->ri_RelationDesc;
+       Index           resultRTindex = 
mtstate->resultRelInfo->ri_RangeTableIndex;
+       ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+       Relation        partrel;
+       TupleDesc       tupDesc = RelationGetDescr(rootrel),
+                               part_tupdesc;
+
+       /* Nothing to do if already set. */
+       if (mtstate->mt_partitions[partidx])
+               return;
+
+       mtstate->mt_partitions[partidx] = (ResultRelInfo *)
+                                                                               
        palloc0(sizeof(ResultRelInfo));
+
+       /*
+        * We locked all the partitions in ExecSetupPartitionTupleRouting
+        * including the leaf partitions.
+        */
+       partrel = heap_open(mtstate->mt_partition_oids[partidx], NoLock);
+       part_tupdesc = RelationGetDescr(partrel);
+       InitResultRelInfo(mtstate->mt_partitions[partidx],
+                                         partrel,
+                                         resultRTindex,
+                                         rootrel,
+                                         estate->es_instrument);
+
+       /*
+        * Verify result relation is a valid target for INSERT.
+        */
+       CheckValidResultRel(mtstate->mt_partitions[partidx], CMD_INSERT);
+
+       /*
+        * Open partition indices.  The user may have asked to check for
+        * conflicts within this leaf partition and do "nothing" instead of
+        * throwing an error.  Be prepared in that case by initializing the
+        * index information needed by ExecInsert() to perform speculative
+        * insertions.
+        */
+       if (partrel->rd_rel->relhasindex &&
+               mtstate->mt_partitions[partidx]->ri_IndexRelationDescs == NULL)
+               ExecOpenIndices(mtstate->mt_partitions[partidx],
+                                               mtstate->mt_onconflict != 
ONCONFLICT_NONE);
+
+       /*
+        * Save a tuple conversion map to convert a tuple routed to this
+        * partition from the parent's type to the partition's.
+        */
+       mtstate->mt_partition_tupconv_maps[partidx] =
+                                                       
convert_tuples_by_name(tupDesc, part_tupdesc,
+                                                                               
           gettext_noop("could not convert row type"));
+
+       /*
+        * Also, if needed, the map to convert from partition's rowtype to the
+        * parent's that is needed to store the partition's tuples into the
+        * transition tuplestore which only accepts tuples of parent's rowtype.
+        */
+       if (mtstate->mt_transition_tupconv_maps)
+               mtstate->mt_transition_tupconv_maps[partidx] =
+                                                       
convert_tuples_by_name(part_tupdesc, tupDesc,
+                                                                               
           gettext_noop("could not convert row type"));
+
+       /*
+        * Build WITH CHECK OPTION constraints for this leaf partition rel.  Note
+        * that we didn't build the withCheckOptionList for each partition within
+        * the planner, but simple translation of the varattnos for each partition
+        * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
+        * cases are handled in ExecInitModifyTable().
+        */
+       if (node && node->withCheckOptionLists != NIL)
+       {
+               List       *wcoList;
+               List       *mapped_wcoList;
+               List       *wcoExprs = NIL;
+               ListCell   *ll;
+
+               /*
+                * In case of INSERT on partitioned tables, there is only one 
plan.
+                * Likewise, there is only one WITH CHECK OPTIONS list, not one 
per
+                * partition.  We make a copy of the WCO qual for each 
partition; note
+                * that, if there are SubPlans in there, they all end up 
attached to
+                * the one parent Plan node.
+                */
+               Assert(mtstate->operation == CMD_INSERT &&
+                          list_length(node->withCheckOptionLists) == 1 &&
+                          mtstate->mt_nplans == 1);
+               wcoList = linitial(node->withCheckOptionLists);
+               mapped_wcoList = map_partition_varattnos(wcoList,
+                                                                               
                 resultRTindex,
+                                                                               
                 partrel, rootrel, NULL);
+               foreach(ll, mapped_wcoList)
+               {
+                       WithCheckOption *wco = castNode(WithCheckOption, 
lfirst(ll));
+                       ExprState  *wcoExpr = ExecInitQual(castNode(List, 
wco->qual),
+                                                                               
           mtstate->mt_plans[0]);
+                       wcoExprs = lappend(wcoExprs, wcoExpr);
+               }
+
+               mtstate->mt_partitions[partidx]->ri_WithCheckOptions = 
mapped_wcoList;
+               mtstate->mt_partitions[partidx]->ri_WithCheckOptionExprs = 
wcoExprs;
+       }
+
+       /*
+        * Build a projection for this leaf partition rel.  Note that we
+        * didn't build the returningList for each partition within the
+        * planner, but simple translation of the varattnos for each partition
+        * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
+        * are handled in ExecInitModifyTable().
+        */
+       if (node && node->returningLists != NIL)
+       {
+               TupleTableSlot *slot;
+               ExprContext *econtext;
+               List       *returningList;
+               List       *rlist;
+
+               returningList = linitial(node->returningLists);
+
+               /*
+                * Initialize result tuple slot and assign its rowtype using 
the first
+                * RETURNING list.  We assume the rest will look the same.
+                */
+               tupDesc = ExecTypeFromTL(returningList, false);
+
+               /* Set up a slot for the output of the RETURNING projection(s) 
*/
+               ExecInitResultTupleSlot(estate, &mtstate->ps);
+               ExecAssignResultType(&mtstate->ps, tupDesc);
+               slot = mtstate->ps.ps_ResultTupleSlot;
+
+               /* Need an econtext too */
+               if (mtstate->ps.ps_ExprContext == NULL)
+                       ExecAssignExprContext(estate, &mtstate->ps);
+               econtext = mtstate->ps.ps_ExprContext;
+
+               rlist = map_partition_varattnos(returningList,
+                                                                               
resultRTindex,
+                                                                               
partrel, rootrel, NULL);
+               mtstate->mt_partitions[partidx]->ri_projectReturning =
+                               ExecBuildProjectionInfo(rlist, econtext, slot, 
&mtstate->ps,
+                                                                               
part_tupdesc);
+       }
+
+       /* Note that the entries in this list appear in no predetermined order. 
*/
+       estate->es_leaf_result_relations =
+                                                               
lappend(estate->es_leaf_result_relations,
+                                                                               
mtstate->mt_partitions[partidx]);
+}
+
+/*
  * RelationGetPartitionDispatchInfo
  *             Returns information necessary to route tuples down a partition 
tree
  *
diff --git a/src/backend/executor/nodeModifyTable.c 
b/src/backend/executor/nodeModifyTable.c
index 6a3b171587..8b45fdaeb7 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1511,23 +1511,14 @@ ExecSetupTransitionCaptureState(ModifyTableState 
*mtstate, EState *estate)
                mtstate->mt_transition_tupconv_maps = (TupleConversionMap **)
                        palloc0(sizeof(TupleConversionMap *) * 
numResultRelInfos);
 
-               /* Choose the right set of partitions */
+               /*
+                * If partition tuple-routing is active, the partition
+                * ResultRelInfos have not been created yet, so return in that
+                * case.  Instead, each conversion map will be initialized in
+                * ExecInitPartitionResultRelInfo() if and when needed.
+                */
                if (mtstate->mt_partition_dispatch_info != NULL)
-               {
-                       /*
-                        * For tuple routing among partitions, we need 
TupleDescs based on
-                        * the partition routing table.
-                        */
-                       ResultRelInfo **resultRelInfos = mtstate->mt_partitions;
-
-                       for (i = 0; i < numResultRelInfos; ++i)
-                       {
-                               mtstate->mt_transition_tupconv_maps[i] =
-                                       
convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc),
-                                                                               
   RelationGetDescr(targetRelInfo->ri_RelationDesc),
-                                                                               
   gettext_noop("could not convert row type"));
-                       }
-               }
+                       return;
                else
                {
                        /* Otherwise we need the ResultRelInfo for each 
subplan. */
@@ -1978,65 +1969,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, 
int eflags)
        }
 
        /*
-        * Build WITH CHECK OPTION constraints for each leaf partition rel. Note
-        * that we didn't build the withCheckOptionList for each partition 
within
-        * the planner, but simple translation of the varattnos for each 
partition
-        * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
-        * cases are handled above.
-        */
-       if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0)
-       {
-               List       *wcoList;
-               PlanState  *plan;
-
-               /*
-                * In case of INSERT on partitioned tables, there is only one 
plan.
-                * Likewise, there is only one WITH CHECK OPTIONS list, not one 
per
-                * partition.  We make a copy of the WCO qual for each 
partition; note
-                * that, if there are SubPlans in there, they all end up 
attached to
-                * the one parent Plan node.
-                */
-               Assert(operation == CMD_INSERT &&
-                          list_length(node->withCheckOptionLists) == 1 &&
-                          mtstate->mt_nplans == 1);
-               wcoList = linitial(node->withCheckOptionLists);
-               plan = mtstate->mt_plans[0];
-               for (i = 0; i < mtstate->mt_num_partitions; i++)
-               {
-                       Relation        partrel;
-                       List       *mapped_wcoList;
-                       List       *wcoExprs = NIL;
-                       ListCell   *ll;
-
-                       resultRelInfo = mtstate->mt_partitions[i];
-                       partrel = resultRelInfo->ri_RelationDesc;
-
-                       /* varno = node->nominalRelation */
-                       mapped_wcoList = map_partition_varattnos(wcoList,
-                                                                               
                         node->nominalRelation,
-                                                                               
                         partrel, rel, NULL);
-                       foreach(ll, mapped_wcoList)
-                       {
-                               WithCheckOption *wco = 
castNode(WithCheckOption, lfirst(ll));
-                               ExprState  *wcoExpr = 
ExecInitQual(castNode(List, wco->qual),
-                                                                               
                   plan);
-
-                               wcoExprs = lappend(wcoExprs, wcoExpr);
-                       }
-
-                       resultRelInfo->ri_WithCheckOptions = mapped_wcoList;
-                       resultRelInfo->ri_WithCheckOptionExprs = wcoExprs;
-               }
-       }
-
-       /*
         * Initialize RETURNING projections if needed.
         */
        if (node->returningLists)
        {
                TupleTableSlot *slot;
                ExprContext *econtext;
-               List       *returningList;
 
                /*
                 * Initialize result tuple slot and assign its rowtype using 
the first
@@ -2068,31 +2006,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, 
int eflags)
                                                                                
resultRelInfo->ri_RelationDesc->rd_att);
                        resultRelInfo++;
                }
-
-               /*
-                * Build a projection for each leaf partition rel.  Note that we
-                * didn't build the returningList for each partition within the
-                * planner, but simple translation of the varattnos for each 
partition
-                * will suffice.  This only occurs for the INSERT case; 
UPDATE/DELETE
-                * are handled above.
-                */
-               returningList = linitial(node->returningLists);
-               for (i = 0; i < mtstate->mt_num_partitions; i++)
-               {
-                       Relation        partrel;
-                       List       *rlist;
-
-                       resultRelInfo = mtstate->mt_partitions[i];
-                       partrel = resultRelInfo->ri_RelationDesc;
-
-                       /* varno = node->nominalRelation */
-                       rlist = map_partition_varattnos(returningList,
-                                                                               
        node->nominalRelation,
-                                                                               
        partrel, rel, NULL);
-                       resultRelInfo->ri_projectReturning =
-                               ExecBuildProjectionInfo(rlist, econtext, slot, 
&mtstate->ps,
-                                                                               
resultRelInfo->ri_RelationDesc->rd_att);
-               }
        }
        else
        {
@@ -2367,8 +2280,11 @@ ExecEndModifyTable(ModifyTableState *node)
        {
                ResultRelInfo *resultRelInfo = node->mt_partitions[i];
 
-               ExecCloseIndices(resultRelInfo);
-               heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+               if (resultRelInfo)
+               {
+                       ExecCloseIndices(resultRelInfo);
+                       heap_close(resultRelInfo->ri_RelationDesc, NoLock);
+               }
        }
 
        /* Release the standalone partition tuple descriptor, if any */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 1a35c5c9ad..988a374a74 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -982,6 +982,7 @@ typedef struct ModifyTableState
        int                     mt_num_dispatch;        /* Number of entries in 
the above array */
        int                     mt_num_partitions;      /* Number of members in 
the following
                                                                         * 
arrays */
+       Oid                *mt_partition_oids;  /* Per partition OIDs */
        ResultRelInfo **mt_partitions;  /* Per partition result relation 
pointers */
        TupleConversionMap **mt_partition_tupconv_maps;
        /* Per partition tuple conversion map */
-- 
2.11.0

Reply via email to