On Sat, Oct 21, 2023 at 7:39 PM Tatsuo Ishii <is...@sraoss.co.jp> wrote:
> Attached is the v10 patch. This version enhances the performance of
> pattern matching.

Nice! I've attached a couple of more stressful tests (window
partitions of 1000 rows each). Beware that the second one runs my
desktop out of memory fairly quickly with the v10 implementation.

I was able to carve out some time this week to implement a very basic
recursive NFA, which handles both the + and * qualifiers (attached).
It's not production quality -- a frame on the call stack for every row
isn't going to work -- but with only those two features, it's pretty
tiny, and it's able to run the new stress tests with no issue. If I
continue to have time, I hope to keep updating this parallel
implementation as you add features to the StringSet implementation,
and we can see how it evolves. I expect that alternation and grouping
will ratchet up the complexity.

Thanks!
--Jacob
From fb3cbb6f99f0fe7b05027759454d7e0013225929 Mon Sep 17 00:00:00 2001
From: Jacob Champion <champio...@gmail.com>
Date: Fri, 20 Oct 2023 16:11:14 -0700
Subject: [PATCH 2/2] squash! Row pattern recognition patch (executor).

- Extract pattern matching logic into match_pattern().
- Replace the str_set/initials implementation with a recursive
  backtracker.
- Rework match_pattern in preparation for * quantifier: separate success
  from number of matched rows.
- Handle `*` quantifier
- Simplify the evaluate_pattern() signature.
- Remove defineInitial and related code.
---
 src/backend/executor/nodeWindowAgg.c    | 871 +++---------------------
 src/backend/optimizer/plan/createplan.c |   4 -
 src/backend/parser/parse_clause.c       |  31 -
 src/include/nodes/execnodes.h           |   1 -
 src/include/nodes/parsenodes.h          |   2 -
 src/include/nodes/plannodes.h           |   3 -
 6 files changed, 91 insertions(+), 821 deletions(-)

diff --git a/src/backend/executor/nodeWindowAgg.c 
b/src/backend/executor/nodeWindowAgg.c
index 2e1baef7ea..30abe60159 100644
--- a/src/backend/executor/nodeWindowAgg.c
+++ b/src/backend/executor/nodeWindowAgg.c
@@ -161,40 +161,6 @@ typedef struct WindowStatePerAggData
        bool            restart;                /* need to restart this agg in 
this cycle? */
 } WindowStatePerAggData;
 
-/*
- * Set of StringInfo. Used in RPR.
- */
-typedef struct StringSet {
-       StringInfo      *str_set;
-       Size            set_size;       /* current array allocation size in 
number of items */
-       int                     set_index;      /* current used size */
-} StringSet;
-
-/*
- * Allowed subsequent PATTERN variables positions.
- * Used in RPR.
- *
- * pos represents the pattern variable defined order in DEFINE caluase.  For
- * example. "DEFINE START..., UP..., DOWN ..." and "PATTERN START UP DOWN UP"
- * will create:
- * VariablePos[0].pos[0] = 0;          START
- * VariablePos[1].pos[0] = 1;          UP
- * VariablePos[1].pos[1] = 3;          UP
- * VariablePos[2].pos[0] = 2;          DOWN
- *
- * Note that UP has two pos because UP appears in PATTERN twice.
- *
- * By using this strucrture, we can know which pattern variable can be followed
- * by which pattern variable(s). For example, START can be followed by UP and
- * DOWN since START's pos is 0, and UP's pos is 1 or 3, DOWN's pos is 2.
- * DOWN can be followed by UP since UP's pos is either 1 or 3. 
- * 
- */
-#define NUM_ALPHABETS  26      /* we allow [a-z] variable initials */
-typedef struct VariablePos {
-       int                     pos[NUM_ALPHABETS];     /* postion(s) in 
PATTERN */
-} VariablePos;
-
 static void initialize_windowaggregate(WindowAggState *winstate,
                                                                           
WindowStatePerFunc perfuncstate,
                                                                           
WindowStatePerAgg peraggstate);
@@ -249,27 +215,11 @@ static void register_reduced_frame_map(WindowAggState 
*winstate, int64 pos, int
 static void clear_reduced_frame_map(WindowAggState *winstate);
 static void update_reduced_frame(WindowObject winobj, int64 pos);
 
-static int64 evaluate_pattern(WindowObject winobj, int64 current_pos,
-                                                         char *vname, 
StringInfo encoded_str, bool *result);
+static bool evaluate_pattern(WindowObject winobj, int64 current_pos,
+                                                        char *vname);
 
 static bool get_slots(WindowObject winobj, int64 current_pos);
 
-//static int   search_str_set(char *pattern, StringSet *str_set, int 
*initial_orders);
-static int search_str_set(char *pattern, StringSet *str_set, VariablePos 
*variable_pos);
-static char    pattern_initial(WindowAggState *winstate, char *vname);
-static int do_pattern_match(char *pattern, char *encoded_str);
-
-static StringSet *string_set_init(void);
-static void string_set_add(StringSet *string_set, StringInfo str);
-static StringInfo string_set_get(StringSet *string_set, int index);
-static int string_set_get_size(StringSet *string_set);
-static void string_set_discard(StringSet *string_set);
-static VariablePos *variable_pos_init(void);
-static void variable_pos_register(VariablePos *variable_pos, char initial, int 
pos);
-static bool variable_pos_compare(VariablePos *variable_pos, char initial1, 
char initial2);
-static int variable_pos_fetch(VariablePos *variable_pos, char initial, int 
index);
-static void variable_pos_discard(VariablePos *variable_pos);
-
 /*
  * initialize_windowaggregate
  * parallel to initialize_aggregates in nodeAgg.c
@@ -2839,7 +2789,6 @@ ExecInitWindowAgg(WindowAgg *node, EState *estate, int 
eflags)
        winstate->patternRegexpList = node->patternRegexp;
 
        /* Set up row pattern recognition DEFINE clause */
-       winstate->defineInitial = node->defineInitial;
        winstate->defineVariableList = NIL;
        winstate->defineClauseList = NIL;
        if (node->defineClause != NIL)
@@ -4063,145 +4012,75 @@ void register_reduced_frame_map(WindowAggState 
*winstate, int64 pos, int val)
 }
 
 /*
- * update_reduced_frame
- *             Update reduced frame info.
+ * Match the pattern variables in the WindowObject against the rows at a given
+ * position. Returns true if the pattern matches successfully, and stores the
+ * number of matched rows. (*rows may be zero on success; consider for example
+ * the pattern `A*`.)
  */
-static
-void update_reduced_frame(WindowObject winobj, int64 pos)
+static bool
+match_pattern(WindowObject winobj, int64 pos, int pattern_index, int *rows)
 {
        WindowAggState *winstate = winobj->winstate;
-       ListCell        *lc1, *lc2;
-       bool            expression_result;
-       int                     num_matched_rows;
-       int64           original_pos;
-       bool            anymatch;
-       StringInfo      encoded_str;
-       StringInfo      pattern_str = makeStringInfo();
-       StringSet       *str_set;
-       int                     str_set_index = 0;
-       int                     initial_index;
-       VariablePos     *variable_pos;
+       int                     submatch_rows = 0;
 
-       /*
-        * Set of pattern variables evaluated to true.
-        * Each character corresponds to pattern variable.
-        * Example:
-        * str_set[0] = "AB";
-        * str_set[1] = "AC";
-        * In this case at row 0 A and B are true, and A and C are true in row 
1.
-        */
+       char       *vname = strVal(list_nth(winstate->patternVariableList, 
pattern_index));
+       char       *quantifier = strVal(list_nth(winstate->patternRegexpList, 
pattern_index));
 
-       /* initialize pattern variables set */
-       str_set = string_set_init();
+       *rows = 0;
 
-       /* save original pos */
-       original_pos = pos;
-
-       /*
-        * Loop over until none of pattern matches or encounters end of frame.
-        */
-       for (;;)
+       /* Try to match the current row against the pattern variable. */
+       if (!evaluate_pattern(winobj, pos, vname))
        {
-               int64   result_pos = -1;
-
-               /*
-                * Loop over each PATTERN variable.
-                */
-               anymatch = false;
-               encoded_str = makeStringInfo();
-
-               forboth(lc1, winstate->patternVariableList, lc2, 
winstate->patternRegexpList)
-               {
-                       char    *vname = strVal(lfirst(lc1));
-                       char    *quantifier = strVal(lfirst(lc2));
-
-                       elog(DEBUG1, "pos: " INT64_FORMAT " pattern vname: %s 
quantifier: %s", pos, vname, quantifier);
-
-                       expression_result = false;
-
-                       /* evaluate row pattern against current row */
-                       result_pos = evaluate_pattern(winobj, pos, vname, 
encoded_str, &expression_result);
-                       if (expression_result)
-                       {
-                               elog(DEBUG1, "expression result is true");
-                               anymatch = true;
-                       }
-
-                       /*
-                        * If out of frame, we are done.
-                        */
-                        if (result_pos < 0)
-                                break;
-               }
-
-               if (!anymatch)
-               {
-                       /* none of patterns matched. */
-                       break;
-               }
-
-               string_set_add(str_set, encoded_str);
-
-               elog(DEBUG1, "pos: " INT64_FORMAT " str_set_index: %d 
encoded_str: %s", pos, str_set_index, encoded_str->data);
-
-               /* move to next row */
+               /* A failed match is only okay if we have the proper 
quantifier. */
+               if (quantifier[0] != '*')
+                       return false;
+       }
+       else
+       {
+               /* Matched. Advance to the next row. */
+               (*rows)++;
                pos++;
 
-               if (result_pos < 0)
+               if (quantifier[0] == '+' || quantifier[0] == '*')
                {
-                       /* out of frame */
-                       break;
+                       /* Greedy. Try to rematch from the current variable 
first. */
+                       if (match_pattern(winobj, pos, pattern_index, 
&submatch_rows))
+                               goto success;
                }
        }
 
-       if (string_set_get_size(str_set) == 0)
+       /* If there's more to the pattern, try to match it now. */
+       if (pattern_index + 1 < list_length(winstate->patternVariableList))
        {
-               /* no match found in the first row */
-               register_reduced_frame_map(winstate, original_pos, 
RF_UNMATCHED);
-               return;
-       }
-
-       elog(DEBUG2, "pos: " INT64_FORMAT " encoded_str: %s", pos, 
encoded_str->data);
-
-       /* build regular expression */
-       pattern_str = makeStringInfo();
-       appendStringInfoChar(pattern_str, '^');
-       initial_index = 0;
-
-       variable_pos = variable_pos_init();
-
-       forboth (lc1, winstate->patternVariableList, lc2, 
winstate->patternRegexpList)
-       {
-               char    *vname = strVal(lfirst(lc1));
-               char    *quantifier = strVal(lfirst(lc2));
-               char     initial;
-
-               initial = pattern_initial(winstate, vname);
-               Assert(initial != 0);
-               appendStringInfoChar(pattern_str, initial);
-               if (quantifier[0])
-                       appendStringInfoChar(pattern_str, quantifier[0]);
+               if (match_pattern(winobj, pos, pattern_index + 1, 
&submatch_rows))
+                       goto success;
 
                /*
-                * Register the initial at initial_index. If the initial 
appears more
-                * than once, all of it's initial_index will be recorded. This 
could
-                * happen if a pattern variable appears in the PATTERN clause 
more
-                * than once like "UP DOWN UP" "UP UP UP".
+                * We're not to the end of the pattern, and there's no way to 
advance
+                * the machine, so fail.
+                *
+                * TODO: reluctant qualifiers add another possible transition 
here
                 */
-               variable_pos_register(variable_pos, initial, initial_index);
-
-               initial_index++;
+               *rows = 0;
+               return false;
        }
 
-       elog(DEBUG2, "pos: " INT64_FORMAT " pattern: %s", pos, 
pattern_str->data);
+success:
+       *rows += submatch_rows;
+       return true;
+}
 
-       /* look for matching pattern variable sequence */
-       elog(DEBUG1, "search_str_set started");
-       num_matched_rows = search_str_set(pattern_str->data, str_set, 
variable_pos);
-       elog(DEBUG1, "search_str_set returns: %d", num_matched_rows);
+/*
+ * update_reduced_frame
+ *             Update reduced frame info.
+ */
+static
+void update_reduced_frame(WindowObject winobj, int64 pos)
+{
+       WindowAggState *winstate = winobj->winstate;
+       int                     num_matched_rows;
 
-       variable_pos_discard(variable_pos);
-       string_set_discard(str_set);
+       match_pattern(winobj, pos, 0, &num_matched_rows);
 
        /*
         * We are at the first row in the reduced frame.  Save the number of
@@ -4210,15 +4089,15 @@ void update_reduced_frame(WindowObject winobj, int64 
pos)
        if (num_matched_rows <= 0)
        {
                /* no match */
-               register_reduced_frame_map(winstate, original_pos, 
RF_UNMATCHED);
+               register_reduced_frame_map(winstate, pos, RF_UNMATCHED);
        }
        else
        {
                int64           i;
 
-               register_reduced_frame_map(winstate, original_pos, 
RF_FRAME_HEAD);
+               register_reduced_frame_map(winstate, pos, RF_FRAME_HEAD);
 
-               for (i = original_pos + 1; i < original_pos + num_matched_rows; 
i++)
+               for (i = pos + 1; i < pos + num_matched_rows; i++)
                {
                        register_reduced_frame_map(winstate, i, RF_SKIPPED);
                }
@@ -4228,436 +4107,76 @@ void update_reduced_frame(WindowObject winobj, int64 
pos)
 }
 
 /*
- * Perform pattern matching using pattern against str_set. pattern is a
- * regular expression derived from PATTERN clause. Note that the regular
- * expression string is prefixed by '^' and followed by initials represented
- * in a same way as str_set. str_set is a set of StringInfo. Each StringInfo
- * has a string comprising initials of pattern variable strings being true in
- * a row. The initials are one of [a-y], parallel to the order of variable
- * names in DEFINE clause. For example, if DEFINE has variables START, UP and
- * DOWN, PATTERN HAS START, UP and DOWN, then the initials in PATTERN will be
- * 'a', 'b' and 'c'.
- *
- * variable_pos is an array representing the order of pattern variable string
- * initials in PATTERN clause.  For example initial 'a' potion is in
- * variable_pos[0].pos[0] = 0. Note that if the pattern is "START UP DOWN UP"
- * (UP appears twice), then "UP" (initial is 'b') has two position 1 and
- * 3. Thus variable_pos for b is variable_pos[1].pos[0] = 1 and
- * variable_pos[1].pos[1] = 3.
- *
- * Returns the longest number of the matching rows.
+ * Evaluate expression associated with PATTERN variable vname for the given row
+ * position, and return the result.
  */
-static
-int search_str_set(char *pattern, StringSet *str_set, VariablePos 
*variable_pos)
-{
-#define        MAX_CANDIDATE_NUM       10000   /* max pattern match candidate 
size */
-#define        FREEZED_CHAR    'Z'     /* a pattern is freezed if it ends with 
the char */
-#define        DISCARD_CHAR    'z'     /* a pattern is not need to keep */
-
-       int                     set_size;       /* number of rows in the set */
-       int                     resultlen;
-       int                     index;
-       StringSet       *old_str_set, *new_str_set;
-       int                     new_str_size;
-       int                     len;
-
-       set_size = string_set_get_size(str_set);
-       new_str_set = string_set_init();
-       len = 0;
-       resultlen = 0;
-
-       /*
-        * Generate all possible pattern variable name initials as a set of
-        * StringInfo named "new_str_set".  For example, if we have two rows
-        * having "ab" (row 0) and "ac" (row 1) in the input str_set, 
new_str_set
-        * will have set of StringInfo "aa", "ac", "ba" and "bc" in the end.
-        */
-       elog(DEBUG1, "pattern: %s set_size: %d", pattern, set_size);
-       for (index = 0; index < set_size; index++)
-       {
-               StringInfo      str;    /* search target row */
-               char    *p;
-               int             old_set_size;
-               int             i;
-
-               elog(DEBUG1, "index: %d", index);
-
-               if (index == 0)
-               {
-                       /* copy variables in row 0 */
-                       str = string_set_get(str_set, index);
-                       p = str->data;
-
-                       /*
-                        * Loop over each new pattern variable char.
-                        */
-                       while (*p)
-                       {
-                               StringInfo      new = makeStringInfo();
-
-                               /* add pattern variable char */
-                               appendStringInfoChar(new, *p);
-                               /* add new one to string set */
-                               string_set_add(new_str_set, new);
-                               elog(DEBUG1, "old_str: NULL new_str: %s", 
new->data);
-                               p++;    /* next pattern variable */
-                       }
-               }
-               else    /* index != 0 */
-               {
-                       old_str_set = new_str_set;
-                       new_str_set = string_set_init();
-                       str = string_set_get(str_set, index);
-                       old_set_size = string_set_get_size(old_str_set);
-
-                       /*
-                        * Loop over each rows in the previous result set.
-                        */
-                       for (i = 0; i < old_set_size ; i++)
-                       {
-                               StringInfo      new;
-                               char    last_old_char;
-                               int             old_str_len;
-                               StringInfo      old = 
string_set_get(old_str_set, i);
-
-                               p = old->data;
-                               old_str_len = strlen(p);
-                               if (old_str_len > 0)
-                                       last_old_char = p[old_str_len - 1];
-                               else
-                                       last_old_char = '\0';
-
-                               /* Is this old set freezed? */
-                               if (last_old_char == FREEZED_CHAR)
-                               {
-                                       /* if shorter match. we can discard it 
*/
-                                       if ((old_str_len - 1) < resultlen)
-                                       {
-                                               elog(DEBUG1, "discard this old 
set because shorter match: %s", old->data);
-                                               continue;
-                                       }
-
-                                       elog(DEBUG1, "keep this old set: %s", 
old->data);
-                                       new = makeStringInfo();
-                                       appendStringInfoString(new, old->data);
-                                       string_set_add(new_str_set, new);
-                                       continue;
-                               }
-                               /* Can this old set be discarded? */
-                               else if (last_old_char == DISCARD_CHAR)
-                               {
-                                       elog(DEBUG1, "discard this old set: 
%s", old->data);
-                                       continue;
-                               }
-
-                               elog(DEBUG1, "str->data: %s", str->data);
-
-                               /*
-                                * loop over each pattern variable initial char 
in the input
-                                * set.
-                                */
-                               for (p = str->data; *p; p++)
-                               {
-                                       /*
-                                        * Optimization.  Check if the row's 
pattern variable
-                                        * initial character position is 
greater than or equal to
-                                        * the old set's last pattern variable 
initial character
-                                        * position. For example, if the old 
set's last pattern
-                                        * variable initials are "ab", then the 
new pattern
-                                        * variable initial can be "b" or "c" 
but can not be "a",
-                                        * if the initials in PATTERN is 
something like "a b c" or
-                                        * "a b+ c+" etc.  This optimization is 
possible when we
-                                        * only allow "+" quantifier.
-                                        */
-                                       if (variable_pos_compare(variable_pos, 
last_old_char, *p))
-                                       {                                       
        
-                                               /* copy source string */
-                                               new = makeStringInfo();
-                                               appendStringInfoString(new, 
old->data);
-                                               /* add pattern variable char */
-                                               appendStringInfoChar(new, *p);
-                                               elog(DEBUG1, "old_str: %s 
new_str: %s", old->data, new->data);
-
-                                               /*
-                                                * Adhoc optimization. If the 
first letter in the
-                                                * input string is the first 
and second position one
-                                                * and there's no associated 
quatifier '+', then we
-                                                * can dicard the input because 
there's no chace to
-                                                * expand the string further.
-                                                *
-                                                * For example, pattern "abc" 
cannot match "aa".
-                                                */
-                                               elog(DEBUG1, "pattern[1]:%c 
pattern[2]:%c new[0]:%c new[1]:%c",
-                                                        pattern[1], 
pattern[2], new->data[0], new->data[1]);
-                                               if (pattern[1] == new->data[0] 
&&
-                                                       pattern[1] == 
new->data[1] &&
-                                                       pattern[2] != '+' &&
-                                                       pattern[1] != 
pattern[2])
-                                               {
-                                                       elog(DEBUG1, "discard 
this new data: %s",
-                                                               new->data);
-                                                       pfree(new->data);
-                                                       pfree(new);
-                                                       continue;
-                                               }
-
-                                               /* add new one to string set */
-                                               string_set_add(new_str_set, 
new);
-                                       }
-                                       else
-                                       {
-                                               /*
-                                                * We are freezing this pattern 
string.  Since there's
-                                                * no chance to expand the 
string further, we perform
-                                                * pattern matching against the 
string. If it does not
-                                                * match, we can discard it.
-                                                */
-                                               len = do_pattern_match(pattern, 
old->data);
-
-                                               if (len <= 0)
-                                               {
-                                                       /* no match. we can 
discard it */
-                                                       continue;
-                                               }
-
-                                               else if (len <= resultlen)
-                                               {
-                                                       /* shorter match. we 
can discard it */
-                                                       continue;
-                                               }
-                                               else
-                                               {
-                                                       /* match length is the 
longest so far */
-
-                                                       int             
new_index;
-
-                                                       /* remember the longest 
match */
-                                                       resultlen = len;
-
-                                                       /* freeze the pattern 
string */
-                                                       new = makeStringInfo();
-                                                       
appendStringInfoString(new, old->data);
-                                                       /* add freezed mark */
-                                                       
appendStringInfoChar(new, FREEZED_CHAR);
-                                                       elog(DEBUG1, "old_str: 
%s new_str: %s", old->data, new->data);
-                                                       
string_set_add(new_str_set, new);
-
-                                                       /*
-                                                        * Search new_str_set 
to find out freezed
-                                                        * entries that have 
shorter match length.
-                                                        * Mark them as 
"discard" so that they are
-                                                        * discarded in the 
next round.
-                                                        */
-
-                                                       /* new_index_size 
should be the one before */
-                                                       new_str_size = 
string_set_get_size(new_str_set) - 1;
-
-                                                       /* loop over 
new_str_set */
-                                                       for (new_index = 0; 
new_index < new_str_size; new_index++)
-                                                       {
-                                                               char    
new_last_char;
-                                                               int             
new_str_len;
-
-                                                               new = 
string_set_get(new_str_set, new_index);
-                                                               new_str_len = 
strlen(new->data);
-                                                               if (new_str_len 
> 0)
-                                                               {
-                                                                       
new_last_char = new->data[new_str_len - 1];
-                                                                       if 
(new_last_char == FREEZED_CHAR &&
-                                                                               
(new_str_len - 1) <= len)
-                                                                       {
-                                                                               
/* mark this set to discard in the next round */
-                                                                               
appendStringInfoChar(new, DISCARD_CHAR);
-                                                                               
elog(DEBUG1, "add discard char: %s", new->data);
-                                                                       }
-                                                               }
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-                       /* we no longer need old string set */
-                       string_set_discard(old_str_set);
-               }
-       }
-
-       /*
-        * Perform pattern matching to find out the longest match.
-        */
-       new_str_size = string_set_get_size(new_str_set);
-       elog(DEBUG1, "new_str_size: %d", new_str_size);
-       len = 0;
-       resultlen = 0;
-
-       for (index = 0; index < new_str_size; index++)
-       {
-               StringInfo      s;
-
-               s = string_set_get(new_str_set, index);
-               if (s == NULL)
-                       continue;       /* no data */
-
-               elog(DEBUG1, "target string: %s", s->data);
-
-               len = do_pattern_match(pattern, s->data);
-               if (len > resultlen)
-               {
-                       /* remember the longest match */
-                       resultlen = len;
-
-                       /*
-                        * If the size of result set is equal to the number of 
rows in the
-                        * set, we are done because it's not possible that the 
number of
-                        * matching rows exceeds the number of rows in the set.
-                        */
-                       if (resultlen >= set_size)
-                               break;
-               }
-       }
-
-       /* we no longer need new string set */
-       string_set_discard(new_str_set);
-
-       return resultlen;
-}
-
-/*
- * do_pattern_match
- * perform pattern match using pattern against encoded_str.
- * returns matching number of rows if matching is succeeded.
- * Otherwise returns 0.
- */
-static
-int do_pattern_match(char *pattern, char *encoded_str)
-{
-       Datum   d;
-       text    *res;
-       char    *substr;
-       int             len = 0;
-       text    *pattern_text, *encoded_str_text;
-
-       pattern_text = cstring_to_text(pattern);
-       encoded_str_text = cstring_to_text(encoded_str);
-
-       /*
-        * We first perform pattern matching using regexp_instr, then call
-        * textregexsubstr to get matched substring to know how long the
-        * matched string is. That is the number of rows in the reduced window
-        * frame.  The reason why we can't call textregexsubstr in the first
-        * place is, it errors out if pattern does not match.
-        */
-       if (DatumGetInt32(DirectFunctionCall2Coll(regexp_instr, 
DEFAULT_COLLATION_OID,
-                                                                               
          PointerGetDatum(encoded_str_text),
-                                                                               
          PointerGetDatum(pattern_text))))
-       {
-               d = DirectFunctionCall2Coll(textregexsubstr,
-                                                                       
DEFAULT_COLLATION_OID,
-                                                                       
PointerGetDatum(encoded_str_text),
-                                                                       
PointerGetDatum(pattern_text));
-               if (d != 0)
-               {
-                       res = DatumGetTextPP(d);
-                       substr = text_to_cstring(res);
-                       len = strlen(substr);
-                       pfree(substr);
-               }
-       }
-       pfree(encoded_str_text);
-       pfree(pattern_text);
-
-       return len;
-}
-
-
-/*
- * Evaluate expression associated with PATTERN variable vname.
- * relpos is relative row position in a frame (starting from 0).
- * "quantifier" is the quatifier part of the PATTERN regular expression.
- * Currently only '+' is allowed.
- * result is out paramater representing the expression evaluation result
- * is true of false.
- * Return values are:
- * >=0: the last match absolute row position
- * other wise out of frame.
- */
-static
-int64 evaluate_pattern(WindowObject winobj, int64 current_pos,
-                                               char *vname, StringInfo 
encoded_str, bool *result)
+static bool
+evaluate_pattern(WindowObject winobj, int64 current_pos, char *vname)
 {
        WindowAggState  *winstate = winobj->winstate;
        ExprContext             *econtext = winstate->ss.ps.ps_ExprContext;
-       ListCell                *lc1, *lc2, *lc3;
+       ListCell                *lc1, *lc2;
        ExprState               *pat;
        Datum                   eval_result;
-       bool                    out_of_frame = false;
        bool                    isnull;
        TupleTableSlot *slot;
+       bool                    result;
 
-       forthree (lc1, winstate->defineVariableList, lc2, 
winstate->defineClauseList, lc3, winstate->defineInitial)
+       forboth (lc1, winstate->defineVariableList, lc2, 
winstate->defineClauseList)
        {
-               char    initial;
                char    *name = strVal(lfirst(lc1));
 
                if (strcmp(vname, name))
                        continue;
 
-               initial = *(strVal(lfirst(lc3)));
-
                /* set expression to evaluate */
                pat = lfirst(lc2);
 
                /* get current, previous and next tuples */
                if (!get_slots(winobj, current_pos))
                {
-                       out_of_frame = true;
+                       /* out of frame */
+                       return false;
+               }
+
+               /* evaluate the expression */
+               eval_result = ExecEvalExpr(pat, econtext, &isnull);
+               if (isnull)
+               {
+                       /* expression is NULL */
+                       elog(DEBUG1, "expression for %s is NULL at row: " 
INT64_FORMAT, vname, current_pos);
+                       result = false;
                }
                else
                {
-                       /* evaluate the expression */
-                       eval_result = ExecEvalExpr(pat, econtext, &isnull);
-                       if (isnull)
+                       if (!DatumGetBool(eval_result))
                        {
-                               /* expression is NULL */
-                               elog(DEBUG1, "expression for %s is NULL at row: 
" INT64_FORMAT, vname, current_pos);
-                               *result = false;
+                               /* expression is false */
+                               elog(DEBUG1, "expression for %s is false at 
row: " INT64_FORMAT, vname, current_pos);
+                               result = false;
                        }
                        else
                        {
-                               if (!DatumGetBool(eval_result))
-                               {
-                                       /* expression is false */
-                                       elog(DEBUG1, "expression for %s is 
false at row: " INT64_FORMAT, vname, current_pos);
-                                       *result = false;
-                               }
-                               else
-                               {
-                                       /* expression is true */
-                                       elog(DEBUG1, "expression for %s is true 
at row: " INT64_FORMAT, vname, current_pos);
-                                       appendStringInfoChar(encoded_str, 
initial);
-                                       *result = true;
-                               }
+                               /* expression is true */
+                               elog(DEBUG1, "expression for %s is true at row: 
" INT64_FORMAT, vname, current_pos);
+                               result = true;
                        }
-
-                       slot = winstate->temp_slot_1;
-                       if (slot != winstate->null_slot)
-                               ExecClearTuple(slot);
-                       slot = winstate->prev_slot;
-                       if (slot != winstate->null_slot)
-                               ExecClearTuple(slot);
-                       slot = winstate->next_slot;
-                       if (slot != winstate->null_slot)
-                               ExecClearTuple(slot);
-
-                       break;
                }
 
-               if (out_of_frame)
-               {
-                       *result = false;
-                       return -1;
-               }
+               slot = winstate->temp_slot_1;
+               if (slot != winstate->null_slot)
+                       ExecClearTuple(slot);
+               slot = winstate->prev_slot;
+               if (slot != winstate->null_slot)
+                       ExecClearTuple(slot);
+               slot = winstate->next_slot;
+               if (slot != winstate->null_slot)
+                       ExecClearTuple(slot);
+
+               return result;
        }
-       return current_pos;
+
+       pg_unreachable();
 }
 
 /*
@@ -4738,211 +4257,3 @@ bool get_slots(WindowObject winobj, int64 current_pos)
        }
        return true;
 }
-
-/*
- * Return pattern variable initial character
- * matching with pattern variable name vname.
- * If not found, return 0.
- */
-static
-char   pattern_initial(WindowAggState *winstate, char *vname)
-{
-       char            initial;
-       char            *name;
-       ListCell        *lc1, *lc2;
-
-       forboth (lc1, winstate->defineVariableList, lc2, 
winstate->defineInitial)
-       {
-               name = strVal(lfirst(lc1));                             /* 
DEFINE variable name */
-               initial = *(strVal(lfirst(lc2)));               /* DEFINE 
variable initial */
-
-
-               if (!strcmp(name, vname))
-                               return initial;                                 
/* found */
-       }
-       return 0;
-}
-
-/*
- * string_set_init
- * Create dynamic set of StringInfo.
- */
-static
-StringSet *string_set_init(void)
-{
-/* Initial allocation size of str_set */
-#define STRING_SET_ALLOC_SIZE  1024
-
-       StringSet       *string_set;
-       Size            set_size;
-
-       string_set = palloc0(sizeof(StringSet));
-       string_set->set_index = 0;
-       set_size = STRING_SET_ALLOC_SIZE;
-       string_set->str_set = palloc(set_size * sizeof(StringInfo));
-       string_set->set_size = set_size;
-
-       return string_set;
-}
-
-/*
- * Add StringInfo str to StringSet string_set.
- */
-static
-void string_set_add(StringSet *string_set, StringInfo str)
-{
-       Size    set_size;
-
-       set_size = string_set->set_size;
-       if (string_set->set_index >= set_size)
-       {
-               set_size *= 2;
-               string_set->str_set = repalloc(string_set->str_set,
-                                                                          
set_size * sizeof(StringInfo));
-               string_set->set_size = set_size;
-       }
-
-       string_set->str_set[string_set->set_index++] = str;
-
-       return;
-}
-
-/*
- * Returns StringInfo specified by index.
- * If there's no data yet, returns NULL.
- */
-static
-StringInfo string_set_get(StringSet *string_set, int index)
-{
-       /* no data? */
-       if (index == 0 && string_set->set_index == 0)
-               return NULL;
-
-       if (index < 0 ||index >= string_set->set_index)
-               elog(ERROR, "invalid index: %d", index);
-
-       return string_set->str_set[index];
-}
-
-/*
- * Returns the size of StringSet.
- */
-static
-int string_set_get_size(StringSet *string_set)
-{
-       return string_set->set_index;
-}
-
-/*
- * Discard StringSet.
- * All memory including StringSet itself is freed.
- */
-static
-void string_set_discard(StringSet *string_set)
-{
-       int             i;
-
-       for (i = 0; i < string_set->set_index; i++)
-       {
-               StringInfo str = string_set->str_set[i];
-               pfree(str->data);
-               pfree(str);
-       }
-       pfree(string_set->str_set);
-       pfree(string_set);
-}
-
-/*
- * Create and initialize variable postion structure
- */
-static
-VariablePos *variable_pos_init(void)
-{
-       VariablePos     *variable_pos;
-
-       variable_pos = palloc(sizeof(VariablePos) * NUM_ALPHABETS);
-       MemSet(variable_pos, -1, sizeof(VariablePos) * NUM_ALPHABETS);
-       return variable_pos;
-}
-
-/*
- * Register pattern variable whose initial is initial into postion index.
- * pos is position of initial.
- * If pos is already registered, register it at next empty slot.
- */
-static
-void variable_pos_register(VariablePos *variable_pos, char initial, int pos)
-{
-       int             index = initial - 'a';
-       int             slot;
-       int             i;
-
-       if (pos < 0 || pos > NUM_ALPHABETS)
-               elog(ERROR, "initial is not valid char: %c", initial);
-
-       for (i = 0; i < NUM_ALPHABETS; i++)
-       {
-               slot = variable_pos[index].pos[i];
-               if (slot < 0)
-               {
-                       /* empty slot found */
-                       variable_pos[index].pos[i] = pos;
-                       return;
-               }
-       }
-       elog(ERROR, "no empty slot for initial: %c", initial);
-}
-
-/*
- * Returns true if initial1 can be followed by initial2
- */
-static
-bool variable_pos_compare(VariablePos *variable_pos, char initial1, char 
initial2)
-{
-       int     index1, index2;
-       int pos1, pos2;
-
-       for (index1 = 0; ; index1++)
-       {
-               pos1 = variable_pos_fetch(variable_pos, initial1, index1);
-               if (pos1 < 0)
-                       break;
-
-               for (index2 = 0; ; index2++)
-               {
-                       pos2 = variable_pos_fetch(variable_pos, initial2, 
index2);
-                       if (pos2 < 0)
-                               break;
-                       if (pos1 <= pos2)
-                               return true;
-               }
-       }
-       return false;
-}
-
-/*
- * Fetch position of pattern variable whose initial is initial, and whose index
- * is index. If no postion was registered by initial, index, returns -1.
- */
-static
-int variable_pos_fetch(VariablePos *variable_pos, char initial, int index)
-{
-       int             pos = initial - 'a';
-
-       if (pos < 0 || pos > NUM_ALPHABETS)
-               elog(ERROR, "initial is not valid char: %c", initial);
-
-       if (index < 0 || index > NUM_ALPHABETS)
-               elog(ERROR, "index is not valid: %d", index);
-
-       return variable_pos[pos].pos[index];
-}
-
-/*
- * Discard VariablePos
- */
-static
-void variable_pos_discard(VariablePos *variable_pos)
-{
-       pfree(variable_pos);
-}
diff --git a/src/backend/optimizer/plan/createplan.c 
b/src/backend/optimizer/plan/createplan.c
index 469fcd156b..c2aba23d17 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -288,7 +288,6 @@ static WindowAgg *make_windowagg(List *tlist, Index winref,
                                                                 Oid 
startInRangeFunc, Oid endInRangeFunc,
                                                                 Oid 
inRangeColl, bool inRangeAsc, bool inRangeNullsFirst, List *runCondition,
                                                                 RPSkipTo 
rpSkipTo, List *patternVariable, List *patternRegexp, List *defineClause,
-                                                                List 
*defineInitial,
                                                                 List *qual, 
bool topWindow, Plan *lefttree);
 static Group *make_group(List *tlist, List *qual, int numGroupCols,
                                                 AttrNumber *grpColIdx, Oid 
*grpOperators, Oid *grpCollations,
@@ -2703,7 +2702,6 @@ create_windowagg_plan(PlannerInfo *root, WindowAggPath 
*best_path)
                                                  wc->patternVariable,
                                                  wc->patternRegexp,
                                                  wc->defineClause,
-                                                 wc->defineInitial,
                                                  best_path->qual,
                                                  best_path->topwindow,
                                                  subplan);
@@ -6609,7 +6607,6 @@ make_windowagg(List *tlist, Index winref,
                           Oid startInRangeFunc, Oid endInRangeFunc,
                           Oid inRangeColl, bool inRangeAsc, bool 
inRangeNullsFirst, List *runCondition,
                           RPSkipTo rpSkipTo, List *patternVariable, List 
*patternRegexp, List *defineClause,
-                          List *defineInitial,
                           List *qual, bool topWindow, Plan *lefttree)
 {
        WindowAgg  *node = makeNode(WindowAgg);
@@ -6640,7 +6637,6 @@ make_windowagg(List *tlist, Index winref,
        node->patternVariable = patternVariable;
        node->patternRegexp = patternRegexp;
        node->defineClause = defineClause;
-       node->defineInitial = defineInitial;
 
        plan->targetlist = tlist;
        plan->lefttree = lefttree;
diff --git a/src/backend/parser/parse_clause.c 
b/src/backend/parser/parse_clause.c
index 9c347216f7..ab81f5cbd8 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -3875,16 +3875,11 @@ transformRPR(ParseState *pstate, WindowClause *wc, 
WindowDef *windef, List **tar
 static List *
 transformDefineClause(ParseState *pstate, WindowClause *wc, WindowDef *windef, 
List **targetlist)
 {
-       /* DEFINE variable name initials */
-       static  char    *defineVariableInitials = "abcdefghijklmnopqrstuvwxyz";
-
        ListCell                *lc, *l;
        ResTarget               *restarget, *r;
        List                    *restargets;
        List                    *defineClause;
        char                    *name;
-       int                             initialLen;
-       int                             i;
 
        /*
         * If Row Definition Common Syntax exists, DEFINE clause must exist.
@@ -3998,33 +3993,7 @@ transformDefineClause(ParseState *pstate, WindowClause 
*wc, WindowDef *windef, L
                restargets = lappend(restargets, restarget);
        }
        list_free(restargets);
-
-       /*
-        * Create list of row pattern DEFINE variable name's initial.
-        * We assign [a-z] to them (up to 26 variable names are allowed).
-        */
        restargets = NIL;
-       i = 0;
-       initialLen = strlen(defineVariableInitials);
-
-       foreach(lc, windef->rpCommonSyntax->rpDefs)
-       {
-               char    initial[2];
-
-               restarget = (ResTarget *)lfirst(lc);
-               name = restarget->name;
-
-               if (i >= initialLen)
-               {
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_SYNTAX_ERROR),
-                                        errmsg("number of row pattern 
definition variable names exceeds %d", initialLen),
-                                        parser_errposition(pstate, 
exprLocation((Node *)restarget))));
-               }
-               initial[0] = defineVariableInitials[i++];
-               initial[1] = '\0';
-               wc->defineInitial = lappend(wc->defineInitial, 
makeString(pstrdup(initial)));
-       }
 
        defineClause = transformTargetList(pstate, 
windef->rpCommonSyntax->rpDefs,
                                                           
EXPR_KIND_RPR_DEFINE);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 28d098b1cf..8e77646e8e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2530,7 +2530,6 @@ typedef struct WindowAggState
        List       *defineVariableList; /* list of row pattern definition 
variables (list of String) */
        List       *defineClauseList;   /* expression for row pattern definition
                                                                         * 
search conditions ExprState list */
-       List       *defineInitial;              /* list of row pattern 
definition variable initials (list of String) */
 
        MemoryContext partcontext;      /* context for partition-lifespan data 
*/
        MemoryContext aggcontext;       /* shared context for aggregate working 
data */
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 31b04d6919..613f746861 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1564,8 +1564,6 @@ typedef struct WindowClause
        bool            initial;                /* true if <row pattern initial 
or seek> is initial */
        /* Row Pattern DEFINE clause (list of TargetEntry) */
        List            *defineClause;
-       /* Row Pattern DEFINE variable initial names (list of String) */
-       List            *defineInitial;
        /* Row Pattern PATTERN variable name (list of String) */
        List            *patternVariable;
        /* Row Pattern PATTERN regular expression quantifier ('+' or ''. list 
of String) */
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index e48b59517d..33f4aa4b72 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -1108,9 +1108,6 @@ typedef struct WindowAgg
        /* Row Pattern DEFINE clause (list of TargetEntry) */
        List       *defineClause;
 
-       /* Row Pattern DEFINE variable initial names (list of String) */
-       List            *defineInitial;
-
        /*
         * false for all apart from the WindowAgg that's closest to the root of
         * the plan
-- 
2.39.2

From e413583502bc7602ad1803fa3a8d74d265147002 Mon Sep 17 00:00:00 2001
From: Jacob Champion <champio...@gmail.com>
Date: Mon, 23 Oct 2023 12:31:51 -0700
Subject: [PATCH 1/2] squash! Row pattern recognition patch (tests).

Add long-table tests.
---
 src/test/regress/expected/rpr.out | 68 +++++++++++++++++++++++++++++++
 src/test/regress/sql/rpr.sql      | 56 +++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/src/test/regress/expected/rpr.out 
b/src/test/regress/expected/rpr.out
index 8f8254d3b2..806dee5ce6 100644
--- a/src/test/regress/expected/rpr.out
+++ b/src/test/regress/expected/rpr.out
@@ -635,6 +635,74 @@ DOWN AS price < PREV(price)
  company2 | 07-10-2023 |  1300 |             |            |      |      |      
|                       |     0
 (20 rows)
 
+--
+-- Bigger datasets
+--
+CREATE TEMP TABLE long_stock (
+       company TEXT,
+       tdate DATE,
+       price INTEGER
+);
+INSERT INTO long_stock SELECT 'company1', DATE '2023-07-01' + i, 200 + 100 * 
sin(i)
+                         FROM generate_series(1, 1000) i;
+INSERT INTO long_stock SELECT 'company2', DATE '2023-07-01' + i, 300 + 200 * 
sin(i + 3)
+                         FROM generate_series(1, 1000) i;
+SELECT company, count(*), min(price), round(avg(price)) AS avg, max(price)
+  FROM long_stock GROUP BY company;
+ company  | count | min | avg | max 
+----------+-------+-----+-----+-----
+ company1 |  1000 | 100 | 200 | 300
+ company2 |  1000 | 100 | 300 | 500
+(2 rows)
+
+-- long test using PREV. Expect approximately 1000 / (2*pi) = 159 periods of 
the
+-- sinusoids to match.
+WITH q AS (
+  SELECT company, tdate, first_value(price) OVER w
+   FROM long_stock
+   WINDOW w AS (
+   PARTITION BY company
+   ORDER BY tdate
+   ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+   INITIAL
+   PATTERN (START UP+ DOWN+)
+   DEFINE
+    START AS TRUE,
+    UP AS price >= PREV(price),
+    DOWN AS price <= PREV(price)
+  )
+) SELECT company, count(first_value) AS matches
+   FROM q GROUP BY company;
+ company  | matches 
+----------+---------
+ company1 |     159
+ company2 |     159
+(2 rows)
+
+-- match everything, with multiple matching variables per row (stresses
+-- implementations susceptible to Cartesian explosion)
+WITH q AS (
+  SELECT company, tdate, first_value(price) OVER w
+   FROM long_stock
+   WINDOW w AS (
+   PARTITION BY company
+   ORDER BY tdate
+   ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+   INITIAL
+   PATTERN (A+ B+ C+)
+   DEFINE
+    A AS TRUE,
+    B AS TRUE,
+    C AS TRUE
+  )
+) SELECT company, count(first_value) AS matches
+   FROM q GROUP BY company;
+ company  | matches 
+----------+---------
+ company1 |       1
+ company2 |       1
+(2 rows)
+
 --
 -- Error cases
 --
diff --git a/src/test/regress/sql/rpr.sql b/src/test/regress/sql/rpr.sql
index 38309652f9..51c9245d3b 100644
--- a/src/test/regress/sql/rpr.sql
+++ b/src/test/regress/sql/rpr.sql
@@ -271,6 +271,62 @@ UP AS price > PREV(price),
 DOWN AS price < PREV(price)
 );
 
+--
+-- Bigger datasets
+--
+
+CREATE TEMP TABLE long_stock (
+       company TEXT,
+       tdate DATE,
+       price INTEGER
+);
+
+INSERT INTO long_stock SELECT 'company1', DATE '2023-07-01' + i, 200 + 100 * 
sin(i)
+                         FROM generate_series(1, 1000) i;
+INSERT INTO long_stock SELECT 'company2', DATE '2023-07-01' + i, 300 + 200 * 
sin(i + 3)
+                         FROM generate_series(1, 1000) i;
+
+SELECT company, count(*), min(price), round(avg(price)) AS avg, max(price)
+  FROM long_stock GROUP BY company;
+
+-- long test using PREV. Expect approximately 1000 / (2*pi) = 159 periods of 
the
+-- sinusoids to match.
+WITH q AS (
+  SELECT company, tdate, first_value(price) OVER w
+   FROM long_stock
+   WINDOW w AS (
+   PARTITION BY company
+   ORDER BY tdate
+   ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+   INITIAL
+   PATTERN (START UP+ DOWN+)
+   DEFINE
+    START AS TRUE,
+    UP AS price >= PREV(price),
+    DOWN AS price <= PREV(price)
+  )
+) SELECT company, count(first_value) AS matches
+   FROM q GROUP BY company;
+
+-- match everything, with multiple matching variables per row (stresses
+-- implementations susceptible to Cartesian explosion)
+WITH q AS (
+  SELECT company, tdate, first_value(price) OVER w
+   FROM long_stock
+   WINDOW w AS (
+   PARTITION BY company
+   ORDER BY tdate
+   ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+   INITIAL
+   PATTERN (A+ B+ C+)
+   DEFINE
+    A AS TRUE,
+    B AS TRUE,
+    C AS TRUE
+  )
+) SELECT company, count(first_value) AS matches
+   FROM q GROUP BY company;
+
 --
 -- Error cases
 --
-- 
2.39.2

Reply via email to