A serious stab at regexes

Brent Dax Sun, 04 Nov 2001 00:23:38 -0800

Okay, this bunch of ops is a serious attempt at regular expressions.  I
had a discussion with japhy on this in the Monastery
(http://www.perlmonks.org/index.pl?node_id=122784), and I've come up
with something flexible enough to actually (maybe) work.  Attached is a
patch to modify core.ops and add re.h (defines data structures and such)
and t/op/re.t (six tests).  All tests, including the new ones, pass.


--Brent Dax
[EMAIL PROTECTED]
Configure pumpking for Perl 6

When I take action, I’m not going to fire a $2 million missile at a $10
empty tent and hit a camel in the butt.
    --Dubya

--- core.ops.old        Sun Nov  4 03:07:31 2001
+++ core.ops    Sun Nov  4 03:05:42 2001
@@ -2,6 +2,8 @@
 ** core.ops
 */
 
+#include <parrot/re.h>
+
 =head1 NAME
 
 core.ops
@@ -1959,6 +1961,357 @@
 
 ###############################################################################
 
+=head2 Regular expression operations
+
+These operations are used by the regular expression engine.  Unless 
+otherwise noted, any regexp opcode which takes an integer constant as its 
+last argument branches to that address if the op fails to match.
+
+=over 4
+
+=cut
+
+########################################
+
+=item B<reMatch>(ic, s)
+
+=item B<reMatch>(ic, sc)
+
+Sets the string in $2 as the string to match against and branches to the 
+regular expression at the address specified as $1.
+
+=item B<reMatch>(i, s)
+
+=item B<reMatch>(i, sc)
+
+Same as the ic variant, but jumps to $1 instead of branching.
+
+=cut
+
+AUTO_OP reMatch(ic, s|sc) {
+       cur_re=mem_sys_allocate(sizeof(re_info));
+       
+       cur_re->string=$2;
+       cur_re->flags=0;
+       cur_re->index=0;
+       cur_re->minlength=0;
+       
+    /*
+    ** Allocate a stack
+    ** XXX There ought to be a function to do this, like
+       **  stack_make(interpreter, &cur_re->stack_base, &cur_re->stack_top) or 
+something 
+       */
+    cur_re->stack_base = mem_allocate_aligned(sizeof(struct StackChunk));
+    cur_re->stack_top = &cur_re->stack_base->entry[0];
+    cur_re->stack_base->used = 0;
+    cur_re->stack_base->free = STACK_CHUNK_DEPTH;
+    cur_re->stack_base->next = NULL;
+    cur_re->stack_base->prev = NULL;
+    
+    /* push the current location onto the call stack--we're doing the equivalent of a 
+sub call */
+    push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 3,  
+STACK_ENTRY_DESTINATION, NULL);
+    
+    RETREL($1);
+}
+
+AUTO_OP reMatch(i, s|sc) {
+       cur_re=mem_sys_allocate(sizeof(re_info));
+
+       cur_re->string=$2;
+       cur_re->flags=0;
+       cur_re->index=0;
+       cur_re->minlength=0;
+       
+    /*
+    ** Allocate a stack
+    ** XXX There ought to be a function to do this, like
+       **  stack_make(interpreter, &cur_re->stack_base, &cur_re->stack_top) or 
+something 
+       */
+       cur_re->stack_base = mem_allocate_aligned(sizeof(struct StackChunk));
+    cur_re->stack_top = &cur_re->stack_base->entry[0];
+    cur_re->stack_base->used = 0;
+    cur_re->stack_base->free = STACK_CHUNK_DEPTH;
+    cur_re->stack_base->next = NULL;
+    cur_re->stack_base->prev = NULL;
+    
+    push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 3,  
+STACK_ENTRY_DESTINATION, NULL);
+    
+    /* jump to the first argument */
+    RETABS((opcode_t *)$1);
+}
+
+########################################
+
+=item B<reFlags>(s)
+
+=item B<reFlags>(sc)
+
+Sets the regular expression's flags.  'i' sets the flag 
+RE_case_insensitive_FLAG, 's' sets the flag
+RE_single_line_FLAG, and 'm' sets the flag 
+RE_multiline_FLAG.  Currently only 's' is implemented.
+
+=cut
+
+AUTO_OP reFlags(s|sc) {
+       int i;
+       char ch;
+       
+       for(i=0; i < string_length($1); i++) {
+               /*
+               ** XXX this is a REALLY naughty thing to do--I 
+               **  shouldn't poke around inside the string like this 
+               */
+               ch=((char *)$1->bufstart)[i];
+               
+               switch(ch) {
+                       case 'i':
+                               fprintf(stderr, "Warning: RE option /m not yet 
+implemented");
+                               RE_case_insensitive_SET(cur_re);        break;
+                       case 's':
+                               RE_single_line_SET(cur_re);             break;
+                       case 'm':
+                               fprintf(stderr, "Warning: RE option /m not yet 
+implemented");
+                               RE_multiline_SET(cur_re);               break;
+                       default:
+                               fprintf(stderr, "Warning: unrecognized RE option /%c", 
+ch);
+               }
+       }
+}
+
+########################################
+
+=item B<reFlags>(s)
+
+=item B<reFlags>(sc)
+
+Sets the minimum number of characters that must be left in the 
+string for a match to be possible.  For example, the expression 
+/fo*bar/ must have at least 4 characters in the string left to
+match; the expression /fo+bar/ requires five characters.  This
+information is used to optimize calls to B<reAdvance>.
+
+=cut
+
+AUTO_OP reMinlength(i|ic) {
+       cur_re->minlength=$1;
+}
+
+########################################
+
+=item B<reLiteral>(s, ic)
+
+=item B<reLiteral>(sc, ic)
+
+Matches the string in $1 literally; in other words, if $1="bar",
+this op will match the exact string "bar" and nothing else.
+(This should be sensitive to RE_case_insensitive_FLAG but isn't 
+currently.)
+=cut
+
+AUTO_OP reLiteral(s|sc, ic) {
+       STRING * arg=$1;
+       STRING * cmp=string_make(interpreter, "", 0, 0, 0, 0);
+               
+
+       if(cur_re->index >= string_length(cur_re->string)) {
+               RETREL($2);
+       }
+
+       string_substr(
+               interpreter,
+               cur_re->string,
+               cur_re->index,
+               string_length(arg),
+               &cmp
+       );
+
+       if(!string_compare(interpreter, arg, cmp)) {
+               cur_re->index += string_length(arg);
+       }
+       else {
+               RETREL($2);
+       }
+}
+
+########################################
+
+=item B<reOneof>(s, ic)
+
+=item B<reOneof>(sc, ic)
+
+Matches if the next character in the string being matched against
+is in $1.  (This should be sensitive to RE_case_insensitive_FLAG
+but isn't currently.)
+
+=cut
+
+AUTO_OP reOneof(s|sc, i|ic) {
+       int i;
+       STRING * arg=$1;
+       STRING * matchagainst=string_make(interpreter, "", 0, 0, 0, 0);
+       STRING * nextchar=string_make(interpreter, "", 0, 0, 0, 0);
+
+       if(cur_re->index >= string_length(cur_re->string)) {
+               RETREL($2);
+       }
+
+       string_substr(
+               interpreter,
+               cur_re->string,
+               cur_re->index,
+               1,
+               &matchagainst
+       );
+
+       for(i=0; i < string_length(arg); i++) {
+               string_substr(
+                       interpreter,
+                       arg,
+                       i,
+                       1,
+                       &nextchar
+               );
+
+               if(!string_compare(interpreter, matchagainst, nextchar)) {
+                       cur_re->index++;
+                       RETREL(*);
+               }
+       }
+       RETREL($2);
+}
+
+########################################
+
+=item B<reAnything>(ic)
+
+This behaves the same as '.' in a regular expression; if
+RE_single_line_FLAG is set, it matches any character, otherwise
+it matches only newline.
+
+=cut
+
+AUTO_OP reAnything(ic) {
+       STRING * newline=string_make(interpreter, "\n", strlen("\n"), 0, 0, 0);
+       STRING * cmp=string_make(interpreter, "", 0, 0, 0, 0);
+       
+       if(cur_re->index >= string_length(cur_re->string)) {
+               RETREL($1);
+       }
+
+       string_substr(
+               interpreter,
+               cur_re->string,
+               cur_re->index,
+               1,
+               &cmp
+       );
+       
+       if(RE_single_line_TEST(cur_re) || string_compare(interpreter, newline, cmp)) {
+               cur_re->index++;
+       }
+       else {
+               RETREL($1);
+       }
+}
+
+########################################
+
+=item B<reAdvance>(ic)
+
+This op skips forward one character in the string; it is used to 
+walk forward through the string.  For example, in the expression
+C<"afoobarz" =~ /fo*bar/>, the 'f' doesn't immediately match the 
+'a', so it skips forward a letter; this op provides that behavior.
+
+=cut
+
+AUTO_OP reAdvance(ic) {
+       cur_re->index++;
+
+       if(cur_re->index+cur_re->minlength >= string_length(cur_re->string)) {
+               RETREL($1);
+       }
+}
+
+########################################
+
+=item B<rePushindex>()
+
+This op pushes the current index onto the regular expression's stack.  It is
+used to remember indexes to backtrack to later.
+
+=cut
+
+AUTO_OP rePushindex() {
+       push_generic_entry(
+               interpreter,
+               &cur_re->stack_top,
+               &cur_re->index,
+               STACK_ENTRY_INT,
+               NULL
+       );
+}
+
+########################################
+
+=item B<rePopindex>()
+
+=item B<rePopindex>(ic)
+
+This op pops the regular expression's stack and sets the regular expression's
+index to the popped value.  It is used primarily to backtrack.
+
+=cut
+
+AUTO_OP rePopindex() {
+       if(stack_depth(interpreter, cur_re->stack_base)) {
+               pop_generic_entry(
+                       interpreter,
+                       &cur_re->stack_top,
+                       &cur_re->index,
+                       STACK_ENTRY_INT
+               );
+       }
+}
+
+AUTO_OP rePopindex(ic) {
+       if(stack_depth(interpreter, cur_re->stack_base)) {
+               pop_generic_entry(
+                       interpreter,
+                       &cur_re->stack_top,
+                       &cur_re->index,
+                       STACK_ENTRY_INT
+               );
+       }
+       else {
+               RETREL($1);
+       }
+}
+
+########################################
+
+=item B<reFinished>()
+
+This op does some cleanup so various data structures used by the regular
+expression engine will be garbage collected and jumps back to the next op
+after the reMatch that entered the regular expression.
+
+=cut
+
+
+AUTO_OP reFinished() {
+       opcode_t *dest;
+  
+       (void *)cur_re->stack_top=(void *)cur_re->stack_base=NULL;      /* so it'll be 
+GCed */
+       cur_re=NULL;                                                                   
+                         /* ditto            */
+
+       pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, 
+STACK_ENTRY_DESTINATION);
+       RETABS(dest);
+}
+
+###############################################################################
+
 =head1 COPYRIGHT
 
 Copyright (C) 2001 Yet Another Society. All rights reserved.

--- /dev/null   Wed Dec 31 16:00:00 1969
+++ include\parrot\re.h Sat Nov  3 23:31:50 2001
@@ -0,0 +1,94 @@
+#if !defined(PARROT_RE_H_GUARD)
+#define PARROT_RE_H_GUARD
+
+#include <parrot/parrot.h>
+#include <parrot/string.h>
+#include <parrot/stacks.h>
+
+/*
+** I'm stashing notes on the regex implementation here.
+** In Perl 5, the RE engine is built on a simple principle.  A printout of the
+** structure of the RE /fo*bar/ (obtained with -mre=debug) is below:
+**
+**   1: EXACT <f>(3)
+**   3: STAR(6)
+**   4:   EXACT <o>(0)
+**   6: EXACT <bar>(8)
+**   8: END(0)
+**
+** The basic principle is that, if the operation matches, we jump to the number
+** in the parenthesis.  In this RE system, the exact opposite is true; we jump
+** when we _fail_.
+**
+**
+**   RE:
+**     reFlags ""
+**     reMinlength 4
+**     
+**     branch $start
+**
+**   $advance:
+**     rePopindex
+**     reAdvance $fail
+**   $start:
+**     rePushindex
+**     reLiteral "f", $advance
+**   $findo:
+**     literal "o", $findbar
+**     rePushindex
+**     branch $findo
+**   $findbar:
+**     reLiteral "bar", $backtrack
+**     set I0, 1       #true
+**     reFinished
+**   $backtrack:
+**     rePopindex $advance
+**     branch $findbar
+**   $fail:
+**     set I0, 0       #false
+**     reFinished
+**
+** Although this looks like more code than the compact setup above, we're using
+** normal opcodes, so we have to be explicit.  There's no STAR op; the branch
+** implicitly does that, and the rePushindex helps it out.  Nothing is 'nested' 
+** within something else--backtracking has to be explicit, and the pushing/popping
+** must also be explicit.  Even the behavior of starting at the next spot in the 
+** string has to be explicitly laid out.  THESE ARE ALL SIDE EFFECTS OF THE FACT
+** WE'RE USING NORMAL OPCODES.  Further, these opcodes have very little communication
+** between them--a small structure (defined later on in this file) contains just the
+** string we're matching against, the index we're at right now, some metadata about
+** the RE, and a stack.  The size of the program isn't because we're jumping on 
+** false instead of true.  The only place that isn't a win is alternation.
+**
+** Okay, I'm done rambling.  Back to the code...
+*/
+
+typedef struct re_info {
+       STRING * string;
+       INTVAL index;
+       INTVAL flags;
+       INTVAL minlength;
+       
+       /* stack stuff */
+       struct StackChunk * stack_base;
+       struct Stack_Entry * stack_top;
+} re_info;
+
+re_info * cur_re;
+
+#define RE_case_insensitive_FLAG                       0x1
+#define RE_case_insensitive_TEST(info)                 info->flags &   
+RE_case_insensitive_FLAG
+#define RE_case_insensitive_SET(info)          info->flags |=  
+RE_case_insensitive_FLAG
+#define RE_case_insensitive_CLEAR(info)        info->flags &= 
+~RE_case_insensitive_FLAG
+
+#define RE_single_line_FLAG                                    0x2
+#define RE_single_line_TEST(info)                      info->flags &   
+RE_single_line_FLAG
+#define RE_single_line_SET(info)                       info->flags |=  
+RE_single_line_FLAG
+#define RE_single_line_CLEAR(info)                     info->flags &= 
+~RE_single_line_FLAG
+
+#define RE_multiline_FLAG                                      0x4
+#define RE_multiline_TEST(info)                                info->flags &   
+RE_multiline_FLAG
+#define RE_multiline_SET(info)                         info->flags |=  
+RE_multiline_FLAG
+#define RE_multiline_CLEAR(info)                       info->flags &= 
+~RE_multiline_FLAG
+
+#endif

--- /dev/null   Wed Dec 31 16:00:00 1969
+++ t\op\re.t   Sat Nov  3 17:53:26 2001
@@ -0,0 +1,105 @@
+#!perl -w
+
+use Parrot::Test tests => 6;
+
+output_is(<<'CODE', "1", "A is A");
+       reMatch RE, "A"
+       print I0
+       end
+
+RE:
+       reLiteral "A", $fail
+       set I0, 1
+       reFinished
+$fail:
+       set I0, 0
+       reFinished
+
+CODE
+
+output_is(<<'CODE', 0, "A is not B");
+       reMatch RE, "A"
+       print I0
+       end
+       
+RE:
+       reLiteral "B", $fail
+       set I0, 1
+       branch $end
+$fail:
+       set I0, 0
+$end:
+       reFinished
+CODE
+
+output_is(<<'CODE', 1, "advance-on-start works okay");
+       reMatch RE, "bab"
+       print I0
+       end
+       
+RE:
+       branch $start
+$advance:
+       rePopindex
+       reAdvance $fail
+$start:
+       rePushindex
+       reLiteral "a", $advance
+       set I0, 1
+       reFinished
+$fail:
+       set I0, 0
+       reFinished
+CODE
+
+output_is(<<'CODE', 0, "advance-on-start fails okay");
+       reMatch RE, "bxb"
+       print I0
+       end
+       
+RE:
+       branch $start
+$advance:
+       rePopindex
+       reAdvance $fail
+$start:
+       rePushindex
+       reLiteral "a", $advance
+       set I0, 1
+       reFinished
+$fail:
+       set I0, 0
+       reFinished
+CODE
+
+output_is(<<'CODE', "10", "dot works okay");
+       reMatch RE, "a"
+       print I0
+       reMatch RE, "\n"
+       print I0
+       end
+
+RE:
+       reAnything $fail
+       set I0, 1
+       reFinished
+$fail:
+       set I0, 0
+       reFinished
+CODE
+
+output_is(<<'CODE', "10", "character class works okay");
+       reMatch RE, "a"
+       print I0
+       reMatch RE, "z"
+       print I0
+       end
+
+RE:
+       reOneof "abc", $fail
+       set I0, 1
+       reFinished
+$fail:
+       set I0, 0
+       reFinished
+CODE

A serious stab at regexes

Reply via email to