I've been working independently on my own regexp setup.  The "patch" is
attached, except my diff program ran out of memory when I tried to do a
diff of core.ops.  Ouch.  So the stuff for core.ops is in something that
looks like a patch, but really isn't.  You'll have to set that up
manually.  The other two files this creates (t/op/re.t and
include/parrot/re.h) are real patches, however.

This guy implements REs with support for both "^" and "^"-less REs, '.',
literals, and '*?' (and '+?' by extension).  It comes with five tests,
and docs in the core.ops pseudo-patch.

Share and Enjoy,
--Brent Dax
[EMAIL PROTECTED]
Configure pumpking for Perl 6

When I take action, I’m not going to fire a $2 million missile at a $10
empty tent and hit a camel in the butt.
    --Dubya
+++core.ops 
 /*
 ** core.ops
 */

+#include <parrot/re.h>
+
 =head1 NAME
 
 core.ops

---

+###############################################################################
+
+=head2 Regular Expression
+
+The regular expression opcodes implement certain matching, backtracking and 
+other semantics, using the REINFO structure defined in F<parrot/re.h>.
+
+Note that these ops are currently in a state of constant fluctuation and may change
+at any time.
+
+Use of the regular expression ops can be very confusing.  The following example
+may help--it implements C<"afoooobarz"=~/f.o*?bar/>, putting the result in I0:
+
+               match "afoooobarz", ""
+       RE_1:
+               goforward END
+               literal "f"
+               anything
+       RE_2:
+               lazyrepeat
+               literal "o"
+               saveindex
+               literal "bar"
+               
+               backtrack RE_2
+               backtrack RE_1
+       END:
+               endre I0
+
+=over 4
+
+=cut
+
+
+########################################
+
+=item B<match>(s, s)
+
+=item B<match>(sc, s)
+
+=item B<match>(s, sc)
+
+=item B<match>(sc, sc)
+
+Sets $1 as the string to match against and $2 as the options
+on that string.
+
+=cut
+
+AUTO_OP match(s|sc, s|sc) {
+       /*
+        * Start a new match against the string $1 ($2, the options, is
+        * not used yet).
+        *
+        * There is only one match state, the file-scope cur_re, so the
+        * REINFO allocated by an earlier match op is reused rather than
+        * allocating a fresh one every time; the old code dropped the
+        * previous REINFO on every execution and leaked it.  cur_re has
+        * static storage and therefore starts out NULL.  Every field is
+        * reset below, so nothing carries over from the previous match.
+        */
+       if(!cur_re) {
+               cur_re=mem_sys_allocate(sizeof(REINFO));
+       }
+       cur_re->string=$1;
+       /* re_options_set($2); -- unimplemented */
+       cur_re->ok=1;
+       cur_re->reallyscrewed=0;
+       cur_re->index=0;
+       cur_re->startindex=0;
+       cur_re->savedindex=0;
+}
+
+########################################
+
+=item B<goforward>(ic)
+
+Implements the behavior of advancing to the next starting
+position if the current starting position fails.  $1 is the 
+label to branch to if a match is impossible at the current 
+position.
+
+=cut
+
+AUTO_OP goforward(ic) {
+       /*
+        * Begin a fresh attempt: it runs from startindex, and startindex
+        * is bumped so the following attempt begins one character later.
+        */
+       cur_re->index=cur_re->startindex++;
+
+       /*
+        * Forget the position saveindex remembered during the previous
+        * attempt.  Left stale, lazyrepeat would move index back to a
+        * position that belongs to the old starting point, so e.g.
+        * /f.o*?bar/ wrongly failed against "fooxfoobar".
+        */
+       cur_re->savedindex=0;
+
+       /* Past the end of the string: no match is possible, branch to $1. */
+       if(cur_re->index >= string_length(cur_re->string)) {
+               cur_re->ok=0;
+               RETREL($1);
+       }
+
+       cur_re->ok=1;
+}
+
+########################################
+
+=item B<literal>(s)
+
+=item B<literal>(sc)
+
+Matches the string $1 literally; $1 may be more than one 
+character.
+
+=cut
+
+AUTO_OP literal(s|sc) {
+       STRING * arg=$1;
+       STRING * cmp;
+
+       /* Do nothing while the match is already failing. */
+       if(cur_re->reallyscrewed || ! cur_re->ok) {RETREL(*);}
+
+       /* Fewer characters remain than $1 needs: this attempt fails. */
+       if(string_length(cur_re->string) < cur_re->index + string_length(arg)) {
+               cur_re->ok=0;
+               RETREL(*);
+       }
+
+       /*
+        * Allocate the scratch string only now that a comparison will
+        * really happen; it used to be created before the guards above,
+        * so every skipped literal allocated a string it never used.
+        */
+       cmp=string_make(interpreter, "", 0, 0, 0, 0);
+
+       /* Pull out the slice of the subject that should equal $1. */
+       string_substr(
+               interpreter, 
+               cur_re->string, 
+               cur_re->index, 
+               string_length(arg), 
+               &cmp
+       );
+       
+       /* A nonzero compare result is treated as a mismatch. */
+       if(string_compare(interpreter, cmp, arg)) {
+               cur_re->ok=0;
+       }
+       else {
+               /* Matched: consume the literal. */
+               cur_re->index += string_length(arg);
+       }
+}
+
+########################################
+
+=item B<anything>()
+
+Matches any single character; equivalent to '.' in a regexp.
+
+=cut
+
+AUTO_OP anything() {
+       /* Nothing to do while the match is already failing. */
+       if(!cur_re->ok || cur_re->reallyscrewed) {RETREL(*);}
+
+       /* XXX this shouldn't match line break, but it does */
+       /* Consume one character; stepping past the end is a failure. */
+       cur_re->index++;
+       if(cur_re->index > string_length(cur_re->string)) {
+               cur_re->ok=0;
+       }
+}
+
+########################################
+
+=item B<lazyrepeat>()
+
+Implements the behavior of *? (and, by extension, +?)--
+if we backtrack to this opcode, it will advance us past
+whatever was matched between this opcode and a B<saveindex>
+call.
+
+=cut
+
+AUTO_OP lazyrepeat() {
+       /*
+        * While backtracking is being suppressed, count this op as well
+        * so that its paired backtrack op is skipped too.
+        */
+       if(cur_re->reallyscrewed != 0) {
+               cur_re->reallyscrewed += 1;
+       }
+
+       /* Only act when the match is healthy. */
+       if(!(cur_re->ok && !cur_re->reallyscrewed)) {RETREL(*);}
+
+       /*
+        * A nonzero savedindex is the position recorded by saveindex;
+        * resume from there.  Zero means nothing has been saved.
+        */
+       if(cur_re->savedindex != 0) {
+               cur_re->index=cur_re->savedindex;
+       }
+}
+
+########################################
+
+=item B<saveindex>()
+
+Remembers the current position so that B<lazyrepeat> can
+advance past it if we backtrack.  If something between 
+B<lazyrepeat> and this op failed, this failure is noted so that
+we don't try to backtrack to the B<lazyrepeat> again (which would
+throw us into an infinite loop).
+
+=cut
+
+AUTO_OP saveindex() {
+       if(cur_re->reallyscrewed) {
+               /* Backtracking is already being suppressed; leave state alone. */
+       }
+       else if(!cur_re->ok) {
+               /*
+                * Something between lazyrepeat and here failed: flag it
+                * so the next backtrack op does not jump back.
+                */
+               cur_re->reallyscrewed++;
+       }
+       else {
+               /* Still matching: remember where the repeated part ended. */
+               cur_re->savedindex=cur_re->index;
+       }
+}
+
+########################################
+
+=item B<backtrack>(ic)
+
+Backtracks to the op at label $1 if the match failed (unless 
+B<lazyrepeat> told us it was pointless, in which case we clear
+that flag).
+
+=cut
+
+AUTO_OP backtrack(ic) {
+       if(cur_re->reallyscrewed) {
+               /* This backtrack is to be skipped; use up one skip. */
+               cur_re->reallyscrewed--;
+       }
+       else if(!cur_re->ok) {
+               /*
+                * The attempt failed: mark the match as okay again and
+                * jump back to the op at $1 that started the repetition.
+                */
+               cur_re->ok=1;
+               RETREL($1);
+       }
+}
+
+########################################
+
+=item B<endre>(i)
+
+Sets $1 to 1 if the match has been successful, 0 otherwise.
+
+=back
+
+=cut
+
+AUTO_OP endre(i) {
+       /*
+        * Report the outcome in $1: 0 when a skip-backtrack flag is
+        * still pending, otherwise the current ok flag.
+        */
+       $1=cur_re->reallyscrewed ? 0 : cur_re->ok;
+}
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ include\parrot\re.h Fri Nov  2 01:43:10 2001
@@ -0,0 +1,28 @@
+#if !defined(PARROT_RE_H_GUARD)
+#define PARROT_RE_H_GUARD 1
+
+#include <parrot/parrot.h>
+#include <parrot/string.h>
+
+typedef struct re_info { /* state of the regexp match in progress */
+       /* the string we're matching against (set by the match op) */
+       STRING * string;
+       /* nonzero while the current attempt is still succeeding */
+       INTVAL ok;
+       /* indicates how many backtrack ops should be
+        * skipped to avoid getting caught in an
+        * infinite loop (raised by saveindex and lazyrepeat)
+        */
+       INTVAL reallyscrewed;
+       /* the current index into string */
+       INTVAL index;
+       /* XXX these next two should be rolled up into a stack */
+       /* the index we should start at next time we hit goforward */
+       INTVAL startindex;
+       /* index lazyrepeat resumes from on backtrack; 0 means none saved */
+       INTVAL savedindex;
+} REINFO;
+
+REINFO * cur_re; /* NOTE(review): defined, not extern, in a header -- confirm only one file includes it */
+
+#endif
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ t\op\re.t   Fri Nov  2 02:22:44 2001
@@ -0,0 +1,60 @@
+#! perl -w
+
+use Parrot::Test tests => 5;
+
+output_is(<<'CODE', '1', "simple literal");    #1: anchored literal, no goforward
+match "a", ""
+literal "a"
+endre I0
+
+print I0
+CODE
+
+output_is(<<'CODE', '1', "literal w/o matching front");        #2
+       match "ba", ""
+RE_1:
+       goforward END
+       literal "a"
+       backtrack RE_1
+END:
+       endre I0
+       print I0
+CODE
+
+output_is(<<'CODE', '0', "fails gracefully");  #3
+       match "bb", ""
+RE_1:
+       goforward END
+       literal "a"
+       backtrack RE_1
+END:
+       endre I0
+       print I0
+CODE
+
+output_is(<<'CODE', '1', ". works");   #4
+       match "xabc", ""
+RE_1:
+       goforward END
+       anything
+       anything
+       literal "c"
+       backtrack RE_1
+END:
+       endre I0
+       print I0
+CODE
+
+output_is(<<'CODE', '1', "*? works");  #5: lazy repetition via lazyrepeat/saveindex
+       match "foobar", ""
+       literal "f"
+RE_0:
+       lazyrepeat
+       literal "o"
+       saveindex
+       literal "bar"
+       backtrack RE_0
+       
+       endre I0
+       print I0
+CODE

Reply via email to