Hello hackers, Recently I was working with sql arrays in postgres and it turned out that postgres doesn't have such very convinient functional constructions as map, reduce and filter. Currently to map function over array user has to make a subquery like:
select u.* from my_table, lateral ( select array_agg(lower(elem)) from unnest(arr) as elem ) as u; Which is not only inconvenient but not very efficient as well (see 'Demo' section below). When I dug into the code I found that postgres already has the needed infrastructure for implementing map for arrays; actually array coercing already works that way (it basically maps cast function). In the attached patch there is a simple map implementation which introduces new expression type and syntax: MAP(<func_name> OVER <array_expression>) For example: SELECT MAP(upper OVER array['one', 'two', 'three']::text[]); ?column? ----------------- {ONE,TWO,THREE} (1 row) This is probably not the most useful notation and it would be better to have syntax for mapping arbitrary expressions over array, not just function. I'm struggling to come up with a good idea of how it should look like. It could look something like following: MAP(<expr> FOR <placeholder> IN <array_expressin>) For instance: SELECT MAP(x*2 FOR x IN array[1, 2, 3]::int[]); Looking forward for community's suggestions! Demo ---- Here is a small comparison between map and unnest/aggregate ways for per-element processing of arrays. Given a table with 1K rows which contains single column of text[] type. Each array contains 5/10/100 elements. create table my_table (arr text[]); insert into my_table select array_agg(md5(random()::text)) from generate_series(1, 1000) as rows, generate_series(1, 10) as elements group by rows; There are two scripts for pgbench. One for 'map' syntax: select map(upper over arr) from my_table; And one for unnest/aggregate: select u.* from my_table, lateral ( select array_agg(upper(elem)) from unnest(arr) as elem ) as u; Results are: elements per array | map (tps) | unnest/aggregate (tps) --------------------+------------+------------------------ 5 | 139.105359 | 74.434010 10 | 74.089743 | 43.622554 100 | 7.693000 | 5.325805 Apparently map is more efficient for small arrays. And as the size of array increases the difference decreases. I'll be glad to any input from the community. Thanks! -- Ildar Musin i.mu...@postgrespro.ru
diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index e284fd7..85d76b2 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -886,6 +886,56 @@ ExecInitExprRec(Expr *node, ExprState *state, break; } + case T_MapExpr: + { + MapExpr *map = (MapExpr *) node; + ExprState *elemstate; + Oid resultelemtype; + + ExecInitExprRec(map->arrexpr, state, resv, resnull); + + resultelemtype = get_element_type(map->resulttype); + if (!OidIsValid(resultelemtype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("target type is not an array"))); + + /* Construct a sub-expression for the per-element expression */ + elemstate = makeNode(ExprState); + elemstate->expr = map->elemexpr; + elemstate->parent = state->parent; + elemstate->ext_params = state->ext_params; + elemstate->innermost_caseval = (Datum *) palloc(sizeof(Datum)); + elemstate->innermost_casenull = (bool *) palloc(sizeof(bool)); + + ExecInitExprRec(map->elemexpr, elemstate, + &elemstate->resvalue, &elemstate->resnull); + + /* Append a DONE step and ready the subexpression */ + scratch.opcode = EEOP_DONE; + ExprEvalPushStep(elemstate, &scratch); + ExecReadyExpr(elemstate); + + scratch.opcode = EEOP_MAP; + scratch.d.map.elemexprstate = elemstate; + scratch.d.map.resultelemtype = resultelemtype; + + if (elemstate) + { + /* Set up workspace for array_map */ + scratch.d.map.amstate = + (ArrayMapState *) palloc0(sizeof(ArrayMapState)); + } + else + { + /* Don't need workspace if there's no subexpression */ + scratch.d.map.amstate = NULL; + } + + ExprEvalPushStep(state, &scratch); + break; + } + case T_OpExpr: { OpExpr *op = (OpExpr *) node; diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 9d6e25a..b2cbc45 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -328,6 +328,7 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) &&CASE_EEOP_FUNCEXPR_STRICT, &&CASE_EEOP_FUNCEXPR_FUSAGE, &&CASE_EEOP_FUNCEXPR_STRICT_FUSAGE, + &&CASE_EEOP_MAP, &&CASE_EEOP_BOOL_AND_STEP_FIRST, &&CASE_EEOP_BOOL_AND_STEP, &&CASE_EEOP_BOOL_AND_STEP_LAST, @@ -699,6 +700,13 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) EEO_NEXT(); } + EEO_CASE(EEOP_MAP) + { + ExecEvalMapExpr(state, op, econtext); + + EEO_NEXT(); + } + /* * If any of its clauses is FALSE, an AND's result is FALSE regardless * of the states of the rest of the clauses, so we can stop evaluating @@ -2230,6 +2238,33 @@ ExecEvalFuncExprStrictFusage(ExprState *state, ExprEvalStep *op, } /* + * Evaluate a MapExpr expression. + * + * Source array is in step's result variable. + */ +void +ExecEvalMapExpr(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Datum arraydatum; + + /* NULL array -> NULL result */ + if (*op->resnull) + return; + + arraydatum = *op->resvalue; + Assert(op->d.map.elemexprstate != NULL); + + /* + * Use array_map to apply the sub-expression to each array element. + */ + *op->resvalue = array_map(arraydatum, + op->d.map.elemexprstate, + econtext, + op->d.map.resultelemtype, + op->d.map.amstate); +} + +/* * Evaluate a PARAM_EXEC parameter. * * PARAM_EXEC params (internal executor parameters) are stored in the diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 7c045a7..92b65d2 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2621,6 +2621,31 @@ _copyFuncCall(const FuncCall *from) return newnode; } +static A_MapExpr * +_copyAMapExpr(const A_MapExpr *from) +{ + A_MapExpr *newnode = makeNode(A_MapExpr); + + COPY_NODE_FIELD(funcname); + COPY_NODE_FIELD(arrexpr); + COPY_LOCATION_FIELD(location); + + return newnode; +} + +static MapExpr * +_copyMapExpr(const MapExpr *from) +{ + MapExpr *newnode = makeNode(MapExpr); + + COPY_NODE_FIELD(elemexpr); + COPY_NODE_FIELD(arrexpr); + COPY_SCALAR_FIELD(resulttype); + COPY_SCALAR_FIELD(resultcollid); + + return newnode; +} + static A_Star * _copyAStar(const A_Star *from) { @@ -5496,6 +5521,12 @@ copyObjectImpl(const void *from) case T_FuncCall: retval = _copyFuncCall(from); break; + case T_A_MapExpr: + retval = _copyAMapExpr(from); + break; + case T_MapExpr: + retval = _copyMapExpr(from); + break; case T_A_Star: retval = _copyAStar(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 6a971d0..ba30d39 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2350,6 +2350,27 @@ _equalFuncCall(const FuncCall *a, const FuncCall *b) } static bool +_equalAMapExpr(const A_MapExpr *a, const A_MapExpr *b) +{ + COMPARE_NODE_FIELD(funcname); + COMPARE_NODE_FIELD(arrexpr); + COMPARE_LOCATION_FIELD(location); + + return true; +} + +static bool +_equalMapExpr(const MapExpr *a, const MapExpr *b) +{ + COMPARE_NODE_FIELD(elemexpr); + COMPARE_NODE_FIELD(arrexpr); + COMPARE_SCALAR_FIELD(resulttype); + COMPARE_SCALAR_FIELD(resultcollid); + + return true; +} + +static bool _equalAStar(const A_Star *a, const A_Star *b) { return true; @@ -3573,6 +3594,12 @@ equal(const void *a, const void *b) case T_FuncCall: retval = _equalFuncCall(a, b); break; + case T_A_MapExpr: + retval = _equalAMapExpr(a, b); + break; + case T_MapExpr: + retval = _equalMapExpr(a, b); + break; case T_A_Star: retval = _equalAStar(a, b); break; diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c index 1bd2599..dfdd21b 100644 --- a/src/backend/nodes/makefuncs.c +++ b/src/backend/nodes/makefuncs.c @@ -600,6 +600,20 @@ makeFuncCall(List *name, List *args, int location) } /* + * makeMapExpr + * + */ +A_MapExpr * +makeAMapExpr(List *funcname, Node *arrexpr) +{ + A_MapExpr *n = makeNode(A_MapExpr); + + n->funcname = funcname; + n->arrexpr = arrexpr; + return n; +} + +/* * makeGroupingSet * */ diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index a10014f..65bc6b1 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -80,6 +80,8 @@ exprType(const Node *expr) case T_FuncExpr: type = ((const FuncExpr *) expr)->funcresulttype; break; + case T_MapExpr: + return ((MapExpr *) expr)->resulttype; case T_NamedArgExpr: type = exprType((Node *) ((const NamedArgExpr *) expr)->arg); break; @@ -750,6 +752,9 @@ exprCollation(const Node *expr) case T_FuncExpr: coll = ((const FuncExpr *) expr)->funccollid; break; + case T_MapExpr: + coll = ((const MapExpr *) expr)->resultcollid; + break; case T_NamedArgExpr: coll = exprCollation((Node *) ((const NamedArgExpr *) expr)->arg); break; @@ -994,6 +999,9 @@ exprSetCollation(Node *expr, Oid collation) case T_FuncExpr: ((FuncExpr *) expr)->funccollid = collation; break; + case T_MapExpr: + ((MapExpr *) expr)->resultcollid = collation; + break; case T_NamedArgExpr: Assert(collation == exprCollation((Node *) ((NamedArgExpr *) expr)->arg)); break; @@ -1132,6 +1140,10 @@ exprSetInputCollation(Node *expr, Oid inputcollation) case T_FuncExpr: ((FuncExpr *) expr)->inputcollid = inputcollation; break; + case T_MapExpr: + exprSetInputCollation((Node *) ((MapExpr *) expr)->elemexpr, + inputcollation); + break; case T_OpExpr: ((OpExpr *) expr)->inputcollid = inputcollation; break; @@ -1937,6 +1949,16 @@ expression_tree_walker(Node *node, return true; } break; + case T_MapExpr: + { + MapExpr *map = (MapExpr *) node; + + if (walker((Node *) map->arrexpr, context)) + return true; + if (walker((Node *) map->elemexpr, context)) + return true; + } + break; case T_NamedArgExpr: return walker(((NamedArgExpr *) node)->arg, context); case T_OpExpr: @@ -2566,6 +2588,15 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_MapExpr: + { + MapExpr *expr = (MapExpr *) node; + MapExpr *newnode; + + FLATCOPY(newnode, expr, MapExpr); + MUTATE(newnode->arrexpr, expr->arrexpr, Expr *); + return (Node *) newnode; + } case T_NamedArgExpr: { NamedArgExpr *nexpr = (NamedArgExpr *) node; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 1da9d7e..3f6578f 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2793,6 +2793,27 @@ _outFuncCall(StringInfo str, const FuncCall *node) } static void +_outAMapExpr(StringInfo str, const A_MapExpr *node) +{ + WRITE_NODE_TYPE("A_MAPEXPR"); + + WRITE_NODE_FIELD(funcname); + WRITE_NODE_FIELD(arrexpr); + WRITE_LOCATION_FIELD(location); +} + +static void +_outMapExpr(StringInfo str, const MapExpr *node) +{ + WRITE_NODE_TYPE("MAPEXPR"); + + WRITE_NODE_FIELD(elemexpr); + WRITE_NODE_FIELD(arrexpr); + WRITE_OID_FIELD(resulttype); + WRITE_OID_FIELD(resultcollid); +} + +static void _outDefElem(StringInfo str, const DefElem *node) { WRITE_NODE_TYPE("DEFELEM"); @@ -4278,6 +4299,12 @@ outNode(StringInfo str, const void *obj) case T_FuncCall: _outFuncCall(str, obj); break; + case T_A_MapExpr: + _outAMapExpr(str, obj); + break; + case T_MapExpr: + _outMapExpr(str, obj); + break; case T_DefElem: _outDefElem(str, obj); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index babb62d..fe3027b 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -563,6 +563,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <node> func_application func_expr_common_subexpr %type <node> func_expr func_expr_windowless +%type <node> map_expr %type <node> common_table_expr %type <with> with_clause opt_with_clause %type <list> cte_list @@ -652,7 +653,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); LEADING LEAKPROOF LEAST LEFT LEVEL LIKE LIMIT LISTEN LOAD LOCAL LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOGGED - MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE + MAP MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF @@ -13461,6 +13462,8 @@ c_expr: columnref { $$ = $1; } } | case_expr { $$ = $1; } + | map_expr + { $$ = $1; } | func_expr { $$ = $1; } | select_with_parens %prec UMINUS @@ -13556,6 +13559,14 @@ c_expr: columnref { $$ = $1; } } ; +map_expr: MAP '(' type_function_name OVER a_expr ')' + { + A_MapExpr *m = makeAMapExpr(list_make1(makeString($3)), $5); + m->location = @1; + $$ = (Node *)m; + } + ; + func_application: func_name '(' ')' { $$ = (Node *) makeFuncCall($1, NIL, @1); @@ -15423,6 +15434,7 @@ reserved_keyword: | LIMIT | LOCALTIME | LOCALTIMESTAMP + | MAP | NOT | NULL_P | OFFSET diff --git a/src/backend/parser/parse_collate.c b/src/backend/parser/parse_collate.c index 6d34245..0021b24 100644 --- a/src/backend/parser/parse_collate.c +++ b/src/backend/parser/parse_collate.c @@ -667,6 +667,15 @@ assign_collations_walker(Node *node, assign_collations_context *context) &loccontext); } break; + case T_MapExpr: + { + MapExpr *map = (MapExpr *) node; + + (void) expression_tree_walker((Node *) map->elemexpr, + assign_collations_walker, + (void *) &loccontext); + } + break; default: /* diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 385e54a..64b7ea5 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -101,6 +101,7 @@ static Node *transformAExprIn(ParseState *pstate, A_Expr *a); static Node *transformAExprBetween(ParseState *pstate, A_Expr *a); static Node *transformBoolExpr(ParseState *pstate, BoolExpr *a); static Node *transformFuncCall(ParseState *pstate, FuncCall *fn); +static Node *transformMapExpr(ParseState *pstate, A_MapExpr *a_mapexpr); static Node *transformMultiAssignRef(ParseState *pstate, MultiAssignRef *maref); static Node *transformCaseExpr(ParseState *pstate, CaseExpr *c); static Node *transformSubLink(ParseState *pstate, SubLink *sublink); @@ -258,6 +259,10 @@ transformExprRecurse(ParseState *pstate, Node *expr) break; } + case T_A_MapExpr: + result = transformMapExpr(pstate, (A_MapExpr *) expr); + break; + case T_BoolExpr: result = transformBoolExpr(pstate, (BoolExpr *) expr); break; @@ -1486,6 +1491,55 @@ transformFuncCall(ParseState *pstate, FuncCall *fn) } static Node * +transformMapExpr(ParseState *pstate, A_MapExpr *a_mapexpr) +{ + MapExpr *map; + CaseTestExpr *placeholder; + Node *arrexpr; + Oid funcId; + Oid sourceElemType; + Oid targetElemType; + FuncExpr *elemexpr; + Oid collation; + + arrexpr = transformExprRecurse(pstate, a_mapexpr->arrexpr); + + sourceElemType = get_element_type(exprType(arrexpr)); + if (sourceElemType == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("array expression is expected"), + parser_errposition(pstate, exprLocation(a_mapexpr->arrexpr)))); + + collation = IsA(arrexpr, ArrayExpr) ? + get_typcollation(sourceElemType) : + exprCollation(arrexpr); + + /* Create placeholder for per-element expression */ + placeholder = makeNode(CaseTestExpr); + placeholder->typeId = sourceElemType; + placeholder->typeMod = exprTypmod(arrexpr); + placeholder->collation = collation; + + /* Build per-element expression */ + funcId = LookupFuncName(a_mapexpr->funcname, 1, &sourceElemType, false); + targetElemType = get_func_rettype(funcId); + elemexpr = makeFuncExpr(funcId, + targetElemType, + list_make1(placeholder), + InvalidOid, + InvalidOid, /* will be set by parse_collate.c */ + COERCE_EXPLICIT_CALL); + + map = makeNode(MapExpr); + map->arrexpr = (Expr *) arrexpr; + map->elemexpr = (Expr *) elemexpr; + map->resulttype = get_array_type(targetElemType); + + return (Node *) map; +} + +static Node * transformMultiAssignRef(ParseState *pstate, MultiAssignRef *maref) { SubLink *sublink; diff --git a/src/include/executor/execExpr.h b/src/include/executor/execExpr.h index f7b1f77..ddad247 100644 --- a/src/include/executor/execExpr.h +++ b/src/include/executor/execExpr.h @@ -92,6 +92,11 @@ typedef enum ExprEvalOp EEOP_FUNCEXPR_STRICT_FUSAGE, /* + * Evaluate map expression + */ + EEOP_MAP, + + /* * Evaluate boolean AND expression, one step per subexpression. FIRST/LAST * subexpressions are special-cased for performance. Since AND always has * at least two subexpressions, FIRST and LAST never apply to the same @@ -318,6 +323,13 @@ typedef struct ExprEvalStep int nargs; /* number of arguments */ } func; + struct + { + ExprState *elemexprstate; /* null if no per-element work */ + Oid resultelemtype; /* element type of result array */ + struct ArrayMapState *amstate; /* workspace for array_map */ + } map; + /* for EEOP_BOOL_*_STEP */ struct { @@ -695,6 +707,8 @@ extern void ExecEvalFuncExprFusage(ExprState *state, ExprEvalStep *op, ExprContext *econtext); extern void ExecEvalFuncExprStrictFusage(ExprState *state, ExprEvalStep *op, ExprContext *econtext); +extern void ExecEvalMapExpr(ExprState *state, ExprEvalStep *op, + ExprContext *econtext); extern void ExecEvalParamExec(ExprState *state, ExprEvalStep *op, ExprContext *econtext); extern void ExecEvalParamExecParams(Bitmapset *params, EState *estate); diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 57bd52f..7dc3937 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -79,6 +79,7 @@ extern FuncExpr *makeFuncExpr(Oid funcid, Oid rettype, List *args, Oid funccollid, Oid inputcollid, CoercionForm fformat); extern FuncCall *makeFuncCall(List *name, List *args, int location); +extern A_MapExpr * makeAMapExpr(List *funcname, Node *arrexpr); extern DefElem *makeDefElem(char *name, Node *arg, int location); extern DefElem *makeDefElemExtended(char *nameSpace, char *name, Node *arg, diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index adb159a..d34abc2 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -428,6 +428,8 @@ typedef enum NodeTag T_ParamRef, T_A_Const, T_FuncCall, + T_A_MapExpr, + T_MapExpr, T_A_Star, T_A_Indices, T_A_Indirection, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 6390f7e..9c185c7 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -418,6 +418,17 @@ typedef struct A_ArrayExpr } A_ArrayExpr; /* + * A_MapExpr - array map expression + */ +typedef struct A_MapExpr +{ + NodeTag type; + List *funcname; + Node *arrexpr; + int location; /* token location, or -1 if unknown */ +} A_MapExpr; + +/* * ResTarget - * result target (used in target list of pre-transformed parse trees) * diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index f90aa7b..f108898 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -460,6 +460,18 @@ typedef struct FuncExpr } FuncExpr; /* + * MapExpr - array map expression + */ +typedef struct MapExpr +{ + NodeTag type; + Expr *elemexpr; /* expression representing per-element work */ + Expr *arrexpr; /* source expression of array type */ + Oid resulttype; /* OID of target array type */ + Oid resultcollid; /* OID of collation, or InvalidOid if none */ +} MapExpr; + +/* * NamedArgExpr - a named argument of a function * * This node type can only appear in the args list of a FuncCall or FuncExpr diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 23db401..d6c5254 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -243,6 +243,7 @@ PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD) PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD) PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD) PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD) +PG_KEYWORD("map", MAP, RESERVED_KEYWORD) PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD) PG_KEYWORD("match", MATCH, UNRESERVED_KEYWORD) PG_KEYWORD("materialized", MATERIALIZED, UNRESERVED_KEYWORD)