Hi. I have a patch that rearranges the code around partition tuple-routing, such that allocation of per-partition objects (ResultRelInfo, TupleConversionMap, etc.) is delayed until a given partition is actually inserted into (i.e., a tuple is routed to it). I can see a good win for non-bulk inserts with the patch, and the patch is implemented such that it doesn't affect the bulk-insert case much.
Performance numbers:

* Uses the following hash-partitioned table:

    create table t1 (a int, b int) partition by hash (a);
    create table t1_x partition of t1 for values with (modulus M, remainder R)
    ...

* Non-bulk insert uses the following code (insert 100,000 rows one-by-one):

    do $$
    begin
      for i in 1..100000 loop
        insert into t1 values (i, i+1);
      end loop;
    end; $$;

* Times in milliseconds:

    #parts         HEAD      Patched
         8     6216.300     4977.670
        16     9061.388     6360.093
        32    14081.656     8752.405
        64    24887.110    13919.384
       128    45926.251    24582.411
       256    88088.084    45490.894

As you can see, the performance can be as much as 2x faster with the patch, although the time taken still increases as the number of partitions increases, because we still lock *all* partitions at the beginning.

* Bulk-inserting 100,000 rows using COPY:

    copy t1 from '/tmp/t1.csv' csv;

* Times in milliseconds:

    #parts         HEAD      Patched
         8      458.301      450.875
        16      409.271      510.723
        32      500.960      612.003
        64      430.687      795.046
       128      449.314      565.786
       256      493.171      490.187

Not much harm here, although the numbers are a bit noisy.

The patch is divided into 4 parts, the first 3 of which are refactoring patches.

I know this patch will conflict severely with [1] and [2], so it's fine if we consider applying these later. Will add this to the next CF.

Thanks,
Amit

[1] https://commitfest.postgresql.org/16/1023/
[2] https://commitfest.postgresql.org/16/1184/
From a87be8a84d467d65cc0b6cf02655fc3b2b9a458f Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Tue, 19 Dec 2017 10:43:45 +0900 Subject: [PATCH 1/4] Teach CopyFrom to use ModifyTableState for tuple-routing This removes all fields of CopyStateData that were meant for tuple routing and instead uses ModifyTableState that has all those fields, including transition_tupconv_maps. In COPY's case, transition_tupconv_maps is only required if tuple routing is being used, so it's safe. --- src/backend/commands/copy.c | 79 ++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 254be28ae4..c82103e1c5 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -166,14 +166,7 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; - PartitionDispatch *partition_dispatch_info; - int num_dispatch; /* Number of entries in the above array */ - int num_partitions; /* Number of members in the following arrays */ - ResultRelInfo **partitions; /* Per partition result relation pointers */ - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; TransitionCaptureState *transition_capture; - TupleConversionMap **transition_tupconv_maps; /* * These variables are used to reduce overhead in textual COPY FROM. 
@@ -2289,6 +2282,7 @@ CopyFrom(CopyState cstate) ResultRelInfo *resultRelInfo; ResultRelInfo *saved_resultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ + ModifyTableState *mtstate = makeNode(ModifyTableState); ExprContext *econtext; TupleTableSlot *myslot; MemoryContext oldcontext = CurrentMemoryContext; @@ -2478,22 +2472,28 @@ CopyFrom(CopyState cstate) TupleTableSlot *partition_tuple_slot; int num_parted, num_partitions; - - ExecSetupPartitionTupleRouting(NULL, + ModifyTable *node = makeNode(ModifyTable); + + /* Just need make this field appear valid. */ + node->nominalRelation = 1; + mtstate->ps.plan = (Plan *) node; + mtstate->ps.state = estate; + mtstate->resultRelInfo = resultRelInfo; + ExecSetupPartitionTupleRouting(mtstate, cstate->rel, - 1, + node->nominalRelation, estate, &partition_dispatch_info, &partitions, &partition_tupconv_maps, &partition_tuple_slot, &num_parted, &num_partitions); - cstate->partition_dispatch_info = partition_dispatch_info; - cstate->num_dispatch = num_parted; - cstate->partitions = partitions; - cstate->num_partitions = num_partitions; - cstate->partition_tupconv_maps = partition_tupconv_maps; - cstate->partition_tuple_slot = partition_tuple_slot; + mtstate->mt_partition_dispatch_info = partition_dispatch_info; + mtstate->mt_num_dispatch = num_parted; + mtstate->mt_partitions = partitions; + mtstate->mt_num_partitions = num_partitions; + mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; + mtstate->mt_partition_tuple_slot = partition_tuple_slot; /* * If we are capturing transition tuples, they may need to be @@ -2505,12 +2505,13 @@ CopyFrom(CopyState cstate) { int i; - cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * cstate->num_partitions); - for (i = 0; i < cstate->num_partitions; ++i) + mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * + mtstate->mt_num_partitions); + for (i = 0; 
i < mtstate->mt_num_partitions; ++i) { - cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(cstate->partitions[i]->ri_RelationDesc), + mtstate->mt_transition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(mtstate->mt_partitions[i]->ri_RelationDesc), RelationGetDescr(cstate->rel), gettext_noop("could not convert row type")); } @@ -2530,7 +2531,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || - cstate->partition_dispatch_info != NULL || + mtstate->mt_partition_dispatch_info != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2605,7 +2606,7 @@ CopyFrom(CopyState cstate) ExecStoreTuple(tuple, slot, InvalidBuffer, false); /* Determine the partition to heap_insert the tuple into */ - if (cstate->partition_dispatch_info) + if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { int leaf_part_index; TupleConversionMap *map; @@ -2619,11 +2620,11 @@ CopyFrom(CopyState cstate) * partition, respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - cstate->partition_dispatch_info, + mtstate->mt_partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < cstate->num_partitions); + leaf_part_index < mtstate->mt_num_partitions); /* * If this tuple is mapped to a partition that is not same as the @@ -2641,7 +2642,8 @@ CopyFrom(CopyState cstate) * to the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = cstate->partitions[leaf_part_index]; + resultRelInfo = mtstate->mt_partitions[leaf_part_index]; + Assert(resultRelInfo != NULL); /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -2671,7 +2673,7 @@ CopyFrom(CopyState cstate) */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = - cstate->transition_tupconv_maps[leaf_part_index]; + mtstate->mt_transition_tupconv_maps[leaf_part_index]; } else { @@ -2688,7 +2690,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = cstate->partition_tupconv_maps[leaf_part_index]; + map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -2700,7 +2702,7 @@ CopyFrom(CopyState cstate) * point on. Use a dedicated slot from this point on until * we're finished dealing with the partition. */ - slot = cstate->partition_tuple_slot; + slot = mtstate->mt_partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -2852,7 +2854,7 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); /* Close all the partitioned tables, leaf partitions, and their indices */ - if (cstate->partition_dispatch_info) + if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { int i; @@ -2862,23 +2864,26 @@ CopyFrom(CopyState cstate) * the main target table of COPY that will be closed eventually by * DoCopy(). Also, tupslot is NULL for the root partitioned table. 
*/ - for (i = 1; i < cstate->num_dispatch; i++) + for (i = 1; i < mtstate->mt_num_dispatch; i++) { - PartitionDispatch pd = cstate->partition_dispatch_info[i]; + PartitionDispatch pd = mtstate->mt_partition_dispatch_info[i]; heap_close(pd->reldesc, NoLock); ExecDropSingleTupleTableSlot(pd->tupslot); } - for (i = 0; i < cstate->num_partitions; i++) + for (i = 0; i < mtstate->mt_num_partitions; i++) { - ResultRelInfo *resultRelInfo = cstate->partitions[i]; + ResultRelInfo *resultRelInfo = mtstate->mt_partitions[i]; - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); + if (resultRelInfo) + { + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } } /* Release the standalone partition tuple descriptor */ - ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot); + ExecDropSingleTupleTableSlot(mtstate->mt_partition_tuple_slot); } /* Close any trigger target relations */ -- 2.11.0
From 3e251d46de5105581acf620773568bb9cdecdf0b Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Tue, 19 Dec 2017 13:56:25 +0900 Subject: [PATCH 2/4] ExecFindPartition refactoring --- src/backend/commands/copy.c | 5 +---- src/backend/executor/execPartition.c | 14 ++++++-------- src/backend/executor/nodeModifyTable.c | 5 +---- src/include/executor/execPartition.h | 5 +---- 4 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index c82103e1c5..280d449dec 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2619,10 +2619,7 @@ CopyFrom(CopyState cstate) * will get us the ResultRelInfo and TupleConversionMap for the * partition, respectively. */ - leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, - slot, - estate); + leaf_part_index = ExecFindPartition(mtstate, slot); Assert(leaf_part_index >= 0 && leaf_part_index < mtstate->mt_num_partitions); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d545af2b67..a40c174230 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -155,11 +155,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, } /* - * ExecFindPartition -- Find a leaf partition in the partition tree rooted - * at parent, for the heap tuple contained in *slot - * - * estate must be non-NULL; we'll need it to compute any expressions in the - * partition key(s) + * ExecFindPartition -- Find a leaf partition for tuple contained in slot * * If no leaf partition is found, this routine errors out with the appropriate * error message, else it returns the leaf partition sequence number @@ -167,14 +163,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * the partition tree. 
*/ int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, - TupleTableSlot *slot, EState *estate) +ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot) { + EState *estate = mtstate->ps.state; int result; Datum values[PARTITION_MAX_KEYS]; bool isnull[PARTITION_MAX_KEYS]; Relation rel; - PartitionDispatch parent; + PartitionDispatch *pd = mtstate->mt_partition_dispatch_info, + parent; + ResultRelInfo *resultRelInfo = mtstate->resultRelInfo; ExprContext *ecxt = GetPerTupleExprContext(estate); TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index afb83ed3ae..f836dd3703 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -292,10 +292,7 @@ ExecInsert(ModifyTableState *mtstate, * the ResultRelInfo and TupleConversionMap for the partition, * respectively. */ - leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, - slot, - estate); + leaf_part_index = ExecFindPartition(mtstate, slot); Assert(leaf_part_index >= 0 && leaf_part_index < mtstate->mt_num_partitions); diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 86a199d169..19e3b9d233 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -58,9 +58,6 @@ extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, TupleConversionMap ***tup_conv_maps, TupleTableSlot **partition_tuple_slot, int *num_parted, int *num_partitions); -extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate); +extern int ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot); #endif /* EXECPARTITION_H */ -- 2.11.0
From 6ea3100c3df46ee131ea3d7590eaba378536c320 Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Tue, 19 Dec 2017 16:20:09 +0900 Subject: [PATCH 3/4] ExecSetupPartitionTupleRouting refactoring --- src/backend/commands/copy.c | 22 +---------- src/backend/executor/execPartition.c | 69 +++++++++++++++------------------- src/backend/executor/nodeModifyTable.c | 25 +----------- src/include/executor/execPartition.h | 9 +---- 4 files changed, 33 insertions(+), 92 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 280d449dec..e7fe020fa7 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2466,12 +2466,6 @@ CopyFrom(CopyState cstate) */ if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo **partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; ModifyTable *node = makeNode(ModifyTable); /* Just need make this field appear valid. 
*/ @@ -2479,21 +2473,7 @@ CopyFrom(CopyState cstate) mtstate->ps.plan = (Plan *) node; mtstate->ps.state = estate; mtstate->resultRelInfo = resultRelInfo; - ExecSetupPartitionTupleRouting(mtstate, - cstate->rel, - node->nominalRelation, - estate, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; - mtstate->mt_partitions = partitions; - mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; - mtstate->mt_partition_tuple_slot = partition_tuple_slot; + ExecSetupPartitionTupleRouting(mtstate, cstate->rel); /* * If we are capturing transition tuples, they may need to be diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index a40c174230..a495b165bd 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -41,42 +41,19 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * ExecSetupPartitionTupleRouting - set up information needed during * tuple routing for partitioned tables * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo* objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. 
- * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays - * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. */ void -ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) +ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; + EState *estate = mtstate->ps.state; + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Index resultRTindex = node->nominalRelation; ResultRelInfo *leaf_part_rri; /* @@ -84,23 +61,35 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * partitions. */ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo **) palloc(*num_partitions * - sizeof(ResultRelInfo *)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); + mtstate->mt_partition_dispatch_info = + RelationGetPartitionDispatchInfo(rel, + &mtstate->mt_num_dispatch, + &leaf_parts); + mtstate->mt_num_partitions = list_length(leaf_parts); /* + * Allocate an array of ResultRelInfo pointers, but actual + * ResultRelInfo's will be allocated if and when needed. See + * ExecFindPartition where it's done. 
+ */ + mtstate->mt_partitions = (ResultRelInfo **) + palloc0(sizeof(ResultRelInfo *) * + mtstate->mt_num_partitions); + /* Ditto. */ + mtstate->mt_partition_tupconv_maps = + (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * + mtstate->mt_num_partitions); + /* * Initialize an empty slot that will be used to manipulate tuples of any * given partition's rowtype. It is attached to the caller-specified node * (such as ModifyTableState) and released when the node finishes * processing. */ - *partition_tuple_slot = MakeTupleTableSlot(); + mtstate->mt_partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * - sizeof(ResultRelInfo)); + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo) * + mtstate->mt_num_partitions); i = 0; foreach(cell, leaf_parts) { @@ -119,8 +108,10 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. 
*/ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); + mtstate->mt_partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, + part_tupdesc, + gettext_noop("could not convert row type")); InitResultRelInfo(leaf_part_rri, partrel, @@ -149,7 +140,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - (*partitions)[i] = leaf_part_rri++; + mtstate->mt_partitions[i] = leaf_part_rri++; i++; } } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index f836dd3703..6a3b171587 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1942,30 +1942,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) /* Build state for INSERT tuple routing */ if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo **partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; - - ExecSetupPartitionTupleRouting(mtstate, - rel, - node->nominalRelation, - estate, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; - mtstate->mt_partitions = partitions; - mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; - mtstate->mt_partition_tuple_slot = partition_tuple_slot; - } + ExecSetupPartitionTupleRouting(mtstate, rel); /* * Build state for collecting transition tuples. 
This requires having a diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 19e3b9d233..c3ddf879b9 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -50,14 +50,7 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); + Relation rel); extern int ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot); #endif /* EXECPARTITION_H */ -- 2.11.0
From ed8469d38a0747fe1b3d1fb3bb8c45b4cb2a2b45 Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Wed, 1 Nov 2017 10:31:21 +0900 Subject: [PATCH 4/4] During tuple-routing, initialize per-partition objects lazily Those objects include ResultRelInfo, tuple conversion map, WITH CHECK OPTION quals and RETURNING projections. This means we don't allocate these objects for partitions that are never inserted into. --- src/backend/commands/copy.c | 15 +-- src/backend/executor/execPartition.c | 225 ++++++++++++++++++++++++--------- src/backend/executor/nodeModifyTable.c | 108 ++-------------- src/include/nodes/execnodes.h | 1 + 4 files changed, 180 insertions(+), 169 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e7fe020fa7..3674aea9b3 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2479,23 +2479,14 @@ CopyFrom(CopyState cstate) * If we are capturing transition tuples, they may need to be * converted from partition format back to partitioned table format * (this is only ever necessary if a BEFORE trigger modifies the - * tuple). + * tuple). Note that we don't allocate the actual maps here; they'll + * be allocated by ExecInitPartitionResultRelInfo() if and when + * needed. 
*/ if (cstate->transition_capture != NULL) - { - int i; - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * mtstate->mt_num_partitions); - for (i = 0; i < mtstate->mt_num_partitions; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(mtstate->mt_partitions[i]->ri_RelationDesc), - RelationGetDescr(cstate->rel), - gettext_noop("could not convert row type")); - } - } } /* diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index a495b165bd..3e2226e5f8 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -23,6 +23,8 @@ #include "utils/rls.h" #include "utils/ruleutils.h" +static void ExecInitPartitionResultRelInfo(ModifyTableState *mtstate, + int partidx); static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids); static void get_partition_dispatch_recurse(Relation rel, Relation parent, @@ -47,14 +49,9 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { - TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; - EState *estate = mtstate->ps.state; - ModifyTable *node = (ModifyTable *) mtstate->ps.plan; - Index resultRTindex = node->nominalRelation; - ResultRelInfo *leaf_part_rri; /* * Get the information about the partition tree after locking all the @@ -66,6 +63,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) &mtstate->mt_num_dispatch, &leaf_parts); mtstate->mt_num_partitions = list_length(leaf_parts); + mtstate->mt_partition_oids = (Oid *) palloc0(sizeof(Oid) * + mtstate->mt_num_partitions); + i = 0; + foreach (cell, leaf_parts) + mtstate->mt_partition_oids[i++] = lfirst_oid(cell); /* * Allocate an array of ResultRelInfo pointers, but actual @@ -87,62 +89,6 @@ 
ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) * processing. */ mtstate->mt_partition_tuple_slot = MakeTupleTableSlot(); - - leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo) * - mtstate->mt_num_partitions); - i = 0; - foreach(cell, leaf_parts) - { - Relation partrel; - TupleDesc part_tupdesc; - - /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. - */ - partrel = heap_open(lfirst_oid(cell), NoLock); - part_tupdesc = RelationGetDescr(partrel); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. - */ - mtstate->mt_partition_tupconv_maps[i] = - convert_tuples_by_name(tupDesc, - part_tupdesc, - gettext_noop("could not convert row type")); - - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - - /* - * Verify result relation is a valid target for INSERT. - */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Open partition indices. The user may have asked to check for - * conflicts within this leaf partition and do "nothing" instead of - * throwing an error. Be prepared in that case by initializing the - * index information needed by ExecInsert() to perform speculative - * insertions. - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, - mtstate != NULL && - mtstate->mt_onconflict != ONCONFLICT_NONE); - - estate->es_leaf_result_relations = - lappend(estate->es_leaf_result_relations, leaf_part_rri); - - mtstate->mt_partitions[i] = leaf_part_rri++; - i++; - } } /* @@ -257,11 +203,168 @@ ExecFindPartition(ModifyTableState *mtstate, TupleTableSlot *slot) val_desc ? 
errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); } + /* Initialize the partition result rel, if not done already. */ + ExecInitPartitionResultRelInfo(mtstate, result); ecxt->ecxt_scantuple = ecxt_scantuple_old; return result; } /* + * ExecInitPartitionResultRelInfo + * Initialize ResultRelInfo for a partition if not done already + */ +static void +ExecInitPartitionResultRelInfo(ModifyTableState *mtstate, int partidx) +{ + EState *estate = mtstate->ps.state; + Relation rootrel = mtstate->resultRelInfo->ri_RelationDesc; + Index resultRTindex = mtstate->resultRelInfo->ri_RangeTableIndex; + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Relation partrel; + TupleDesc tupDesc = RelationGetDescr(rootrel), + part_tupdesc; + + /* Nothing to do if already set.*/ + if (mtstate->mt_partitions[partidx]) + return; + + mtstate->mt_partitions[partidx] = (ResultRelInfo *) + palloc0(sizeof(ResultRelInfo)); + + /* + * We locked all the partitions in ExecSetupPartitionTupleRouting + * including the leaf partitions. + */ + partrel = heap_open(mtstate->mt_partition_oids[partidx], NoLock); + part_tupdesc = RelationGetDescr(partrel); + InitResultRelInfo(mtstate->mt_partitions[partidx], + partrel, + resultRTindex, + rootrel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for INSERT. + */ + CheckValidResultRel(mtstate->mt_partitions[partidx], CMD_INSERT); + + /* + * Open partition indices. The user may have asked to check for + * conflicts within this leaf partition and do "nothing" instead of + * throwing an error. Be prepared in that case by initializing the + * index information needed by ExecInsert() to perform speculative + * insertions. 
+ */ + if (partrel->rd_rel->relhasindex && + mtstate->mt_partitions[partidx]->ri_IndexRelationDescs == NULL) + ExecOpenIndices(mtstate->mt_partitions[partidx], + mtstate->mt_onconflict != ONCONFLICT_NONE); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + mtstate->mt_partition_tupconv_maps[partidx] = + convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + /* + * Also, if needed, the map to convert from partition's rowtype to the + * parent's that is needed to store the partition's tuples into the + * transition tuplestore which only accepts tuples of parent's rowtype. + */ + if (mtstate->mt_transition_tupconv_maps) + mtstate->mt_transition_tupconv_maps[partidx] = + convert_tuples_by_name(part_tupdesc, tupDesc, + gettext_noop("could not convert row type")); + + /* + * Build WITH CHECK OPTION constraints for each leaf partition rel. Note + * that we didn't build the withCheckOptionList for each partition within + * the planner, but simple translation of the varattnos for each partition + * will suffice. This only occurs for the INSERT case; UPDATE/DELETE + * cases are handled above. + */ + if (node && node->withCheckOptionLists != NIL) + { + List *wcoList; + List *mapped_wcoList; + List *wcoExprs = NIL; + ListCell *ll; + + /* + * In case of INSERT on partitioned tables, there is only one plan. + * Likewise, there is only one WITH CHECK OPTIONS list, not one per + * partition. We make a copy of the WCO qual for each partition; note + * that, if there are SubPlans in there, they all end up attached to + * the one parent Plan node. 
+ */ + Assert(mtstate->operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + mtstate->mt_nplans == 1); + wcoList = linitial(node->withCheckOptionLists); + mapped_wcoList = map_partition_varattnos(wcoList, + resultRTindex, + partrel, rootrel, NULL); + foreach(ll, mapped_wcoList) + { + WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + mtstate->mt_plans[0]); + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + mtstate->mt_partitions[partidx]->ri_WithCheckOptions = mapped_wcoList; + mtstate->mt_partitions[partidx]->ri_WithCheckOptionExprs = wcoExprs; + } + + /* + * Build a projection for each leaf partition rel. Note that we + * didn't build the returningList for each partition within the + * planner, but simple translation of the varattnos for each partition + * will suffice. This only occurs for the INSERT case; UPDATE/DELETE + * are handled above. + */ + if (node && node->returningLists != NIL) + { + TupleTableSlot *slot; + ExprContext *econtext; + List *returningList; + List *rlist; + + returningList = linitial(node->returningLists); + + /* + * Initialize result tuple slot and assign its rowtype using the first + * RETURNING list. We assume the rest will look the same. 
+ */ + tupDesc = ExecTypeFromTL(returningList, false); + + /* Set up a slot for the output of the RETURNING projection(s) */ + ExecInitResultTupleSlot(estate, &mtstate->ps); + ExecAssignResultType(&mtstate->ps, tupDesc); + slot = mtstate->ps.ps_ResultTupleSlot; + + /* Need an econtext too */ + if (mtstate->ps.ps_ExprContext == NULL) + ExecAssignExprContext(estate, &mtstate->ps); + econtext = mtstate->ps.ps_ExprContext; + + rlist = map_partition_varattnos(returningList, + resultRTindex, + partrel, rootrel, NULL); + mtstate->mt_partitions[partidx]->ri_projectReturning = + ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, + part_tupdesc); + } + + /* Note that the entries in this list appear in no predetermined order. */ + estate->es_leaf_result_relations = + lappend(estate->es_leaf_result_relations, + mtstate->mt_partitions[partidx]); +} + +/* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree * diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 6a3b171587..8b45fdaeb7 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1511,23 +1511,14 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); - /* Choose the right set of partitions */ + /* + * If partition tuple-routing is active, we can't have partition + * ResultRelInfo's just yet, so return in that case. Instead, + * the conversion map will be initialized in + * ExecInitPartitionResultRelInfo() if and when needed. + */ if (mtstate->mt_partition_dispatch_info != NULL) - { - /* - * For tuple routing among partitions, we need TupleDescs based on - * the partition routing table. 
- */ - ResultRelInfo **resultRelInfos = mtstate->mt_partitions; - - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - } - } + return; else { /* Otherwise we need the ResultRelInfo for each subplan. */ @@ -1978,65 +1969,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) } /* - * Build WITH CHECK OPTION constraints for each leaf partition rel. Note - * that we didn't build the withCheckOptionList for each partition within - * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. - */ - if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0) - { - List *wcoList; - PlanState *plan; - - /* - * In case of INSERT on partitioned tables, there is only one plan. - * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. 
- */ - Assert(operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; - for (i = 0; i < mtstate->mt_num_partitions; i++) - { - Relation partrel; - List *mapped_wcoList; - List *wcoExprs = NIL; - ListCell *ll; - - resultRelInfo = mtstate->mt_partitions[i]; - partrel = resultRelInfo->ri_RelationDesc; - - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); - foreach(ll, mapped_wcoList) - { - WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); - ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); - - wcoExprs = lappend(wcoExprs, wcoExpr); - } - - resultRelInfo->ri_WithCheckOptions = mapped_wcoList; - resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - } - } - - /* * Initialize RETURNING projections if needed. */ if (node->returningLists) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2068,31 +2006,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_RelationDesc->rd_att); resultRelInfo++; } - - /* - * Build a projection for each leaf partition rel. Note that we - * didn't build the returningList for each partition within the - * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. 
- */ - returningList = linitial(node->returningLists); - for (i = 0; i < mtstate->mt_num_partitions; i++) - { - Relation partrel; - List *rlist; - - resultRelInfo = mtstate->mt_partitions[i]; - partrel = resultRelInfo->ri_RelationDesc; - - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); - resultRelInfo->ri_projectReturning = - ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, - resultRelInfo->ri_RelationDesc->rd_att); - } } else { @@ -2367,8 +2280,11 @@ ExecEndModifyTable(ModifyTableState *node) { ResultRelInfo *resultRelInfo = node->mt_partitions[i]; - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); + if (resultRelInfo) + { + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } } /* Release the standalone partition tuple descriptor, if any */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1a35c5c9ad..988a374a74 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -982,6 +982,7 @@ typedef struct ModifyTableState int mt_num_dispatch; /* Number of entries in the above array */ int mt_num_partitions; /* Number of members in the following * arrays */ + Oid *mt_partition_oids; /* Per partition OIDs */ ResultRelInfo **mt_partitions; /* Per partition result relation pointers */ TupleConversionMap **mt_partition_tupconv_maps; /* Per partition tuple conversion map */ -- 2.11.0