Okay, this bunch of ops is a serious attempt at regular expressions. I had a discussion with japhy on this in the Monastery (http://www.perlmonks.org/index.pl?node_id=122784), and I've come up with something flexible enough to actually (maybe) work. Attached is a patch to modify core.ops and add re.h (defines data structures and such) and t/op/re.t (six tests). All tests, including the new ones, pass.
--Brent Dax [EMAIL PROTECTED] Configure pumpking for Perl 6 When I take action, I’m not going to fire a $2 million missile at a $10 empty tent and hit a camel in the butt. --Dubya
--- core.ops.old Sun Nov 4 03:07:31 2001 +++ core.ops Sun Nov 4 03:05:42 2001 @@ -2,6 +2,8 @@ ** core.ops */ +#include <parrot/re.h> + =head1 NAME core.ops @@ -1959,6 +1961,357 @@ ############################################################################### +=head2 Regular expression operations + +These operations are used by the regular expression engine. Unless +otherwise noted, any regexp opcode which takes an integer constant as its +last argument branches to that address if the op fails to match. + +=over 4 + +=cut + +######################################## + +=item B<reMatch>(ic, s) + +=item B<reMatch>(ic, sc) + +Sets the string in $2 as the string to match against and branches to the +regular expression at the address specified as $1. + +=item B<reMatch>(i, s) + +=item B<reMatch>(i, sc) + +Same as the ic variant, but jumps to $1 instead of branching. + +=cut + +AUTO_OP reMatch(ic, s|sc) { + cur_re=mem_sys_allocate(sizeof(re_info)); + + cur_re->string=$2; + cur_re->flags=0; + cur_re->index=0; + cur_re->minlength=0; + + /* + ** Allocate a stack + ** XXX There ought to be a function to do this, like + ** stack_make(interpreter, &cur_re->stack_base, &cur_re->stack_top) or +something + */ + cur_re->stack_base = mem_allocate_aligned(sizeof(struct StackChunk)); + cur_re->stack_top = &cur_re->stack_base->entry[0]; + cur_re->stack_base->used = 0; + cur_re->stack_base->free = STACK_CHUNK_DEPTH; + cur_re->stack_base->next = NULL; + cur_re->stack_base->prev = NULL; + + /* push the current location onto the call stack--we're doing the equivalent of a +sub call */ + push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 3, +STACK_ENTRY_DESTINATION, NULL); + + RETREL($1); +} + +AUTO_OP reMatch(i, s|sc) { + cur_re=mem_sys_allocate(sizeof(re_info)); + + cur_re->string=$2; + cur_re->flags=0; + cur_re->index=0; + cur_re->minlength=0; + + /* + ** Allocate a stack + ** XXX There ought to be a function to do this, like + ** stack_make(interpreter, &cur_re->stack_base, &cur_re->stack_top) or +something + */ + cur_re->stack_base = mem_allocate_aligned(sizeof(struct StackChunk)); + cur_re->stack_top = &cur_re->stack_base->entry[0]; + cur_re->stack_base->used = 0; + cur_re->stack_base->free = STACK_CHUNK_DEPTH; + cur_re->stack_base->next = NULL; + cur_re->stack_base->prev = NULL; + + push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 3, +STACK_ENTRY_DESTINATION, NULL); + + /* jump to the first argument */ + RETABS((opcode_t *)$1); +} + +######################################## + +=item B<reFlags>(s) + +=item B<reFlags>(sc) + +Sets the regular expression's flags. 'i' sets the flag +RE_case_insensitive_FLAG, 's' sets the flag +RE_single_line_FLAG, and 'm' sets the flag +RE_multiline_FLAG. Currently only 's' is implemented. + +=cut + +AUTO_OP reFlags(s|sc) { + int i; + char ch; + + for(i=0; i < string_length($1); i++) { + /* + ** XXX this is a REALLY naughty thing to do--I + ** shouldn't poke around inside the string like this + */ + ch=((char *)$1->bufstart)[i]; + + switch(ch) { + case 'i': + fprintf(stderr, "Warning: RE option /m not yet +implemented"); + RE_case_insensitive_SET(cur_re); break; + case 's': + RE_single_line_SET(cur_re); break; + case 'm': + fprintf(stderr, "Warning: RE option /m not yet +implemented"); + RE_multiline_SET(cur_re); break; + default: + fprintf(stderr, "Warning: unrecognized RE option /%c", +ch); + } + } +} + +######################################## + +=item B<reFlags>(s) + +=item B<reFlags>(sc) + +Sets the minimum number of characters that must be left in the +string for a match to be possible. For example, the expression +/fo*bar/ must have at least 4 characters in the string left to +match; the expression /fo+bar/ requires five characters. This +information is used to optimize calls to B<reAdvance>. + +=cut + +AUTO_OP reMinlength(i|ic) { + cur_re->minlength=$1; +} + +######################################## + +=item B<reLiteral>(s, ic) + +=item B<reLiteral>(sc, ic) + +Matches the string in $1 literally; in other words, if $1="bar", +this op will match the exact string "bar" and nothing else. +(This should be sensitive to RE_case_insensitive_FLAG but isn't +currently.) +=cut + +AUTO_OP reLiteral(s|sc, ic) { + STRING * arg=$1; + STRING * cmp=string_make(interpreter, "", 0, 0, 0, 0); + + + if(cur_re->index >= string_length(cur_re->string)) { + RETREL($2); + } + + string_substr( + interpreter, + cur_re->string, + cur_re->index, + string_length(arg), + &cmp + ); + + if(!string_compare(interpreter, arg, cmp)) { + cur_re->index += string_length(arg); + } + else { + RETREL($2); + } +} + +######################################## + +=item B<reOneof>(s, ic) + +=item B<reOneof>(sc, ic) + +Matches if the next character in the string being matched against +is in $1. (This should be sensitive to RE_case_insensitive_FLAG +but isn't currently.) + +=cut + +AUTO_OP reOneof(s|sc, i|ic) { + int i; + STRING * arg=$1; + STRING * matchagainst=string_make(interpreter, "", 0, 0, 0, 0); + STRING * nextchar=string_make(interpreter, "", 0, 0, 0, 0); + + if(cur_re->index >= string_length(cur_re->string)) { + RETREL($2); + } + + string_substr( + interpreter, + cur_re->string, + cur_re->index, + 1, + &matchagainst + ); + + for(i=0; i < string_length(arg); i++) { + string_substr( + interpreter, + arg, + i, + 1, + &nextchar + ); + + if(!string_compare(interpreter, matchagainst, nextchar)) { + cur_re->index++; + RETREL(*); + } + } + RETREL($2); +} + +######################################## + +=item B<reAnything>(ic) + +This behaves the same as '.' in a regular expression; if +RE_single_line_FLAG is set, it matches any character, otherwise +it matches only newline. + +=cut + +AUTO_OP reAnything(ic) { + STRING * newline=string_make(interpreter, "\n", strlen("\n"), 0, 0, 0); + STRING * cmp=string_make(interpreter, "", 0, 0, 0, 0); + + if(cur_re->index >= string_length(cur_re->string)) { + RETREL($1); + } + + string_substr( + interpreter, + cur_re->string, + cur_re->index, + 1, + &cmp + ); + + if(RE_single_line_TEST(cur_re) || string_compare(interpreter, newline, cmp)) { + cur_re->index++; + } + else { + RETREL($1); + } +} + +######################################## + +=item B<reAdvance>(ic) + +This op skips forward one character in the string; it is used to +walk forward through the string. For example, in the expression +C<"afoobarz" =~ /fo*bar/>, the 'f' doesn't immediately match the +'a', so it skips forward a letter; this op provides that behavior. + +=cut + +AUTO_OP reAdvance(ic) { + cur_re->index++; + + if(cur_re->index+cur_re->minlength >= string_length(cur_re->string)) { + RETREL($1); + } +} + +######################################## + +=item B<rePushindex>() + +This op pushes the current index onto the regular expression's stack. It is +used to remember indexes to backtrack to later. + +=cut + +AUTO_OP rePushindex() { + push_generic_entry( + interpreter, + &cur_re->stack_top, + &cur_re->index, + STACK_ENTRY_INT, + NULL + ); +} + +######################################## + +=item B<rePopindex>() + +=item B<rePopindex>(ic) + +This op pops the regular expression's stack and sets the regular expression's +index to the popped value. It is used primarily to backtrack. + +=cut + +AUTO_OP rePopindex() { + if(stack_depth(interpreter, cur_re->stack_base)) { + pop_generic_entry( + interpreter, + &cur_re->stack_top, + &cur_re->index, + STACK_ENTRY_INT + ); + } +} + +AUTO_OP rePopindex(ic) { + if(stack_depth(interpreter, cur_re->stack_base)) { + pop_generic_entry( + interpreter, + &cur_re->stack_top, + &cur_re->index, + STACK_ENTRY_INT + ); + } + else { + RETREL($1); + } +} + +######################################## + +=item B<reFinished>() + +This op does some cleanup so various data structures used by the regular +expression engine will be garbage collected and jumps back to the next op +after the reMatch that entered the regular expression. + +=cut + + +AUTO_OP reFinished() { + opcode_t *dest; + + (void *)cur_re->stack_top=(void *)cur_re->stack_base=NULL; /* so it'll be +GCed */ + cur_re=NULL; + /* ditto */ + + pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, +STACK_ENTRY_DESTINATION); + RETABS(dest); +} + +############################################################################### + =head1 COPYRIGHT Copyright (C) 2001 Yet Another Society. All rights reserved. --- /dev/null Wed Dec 31 16:00:00 1969 +++ include\parrot\re.h Sat Nov 3 23:31:50 2001 @@ -0,0 +1,94 @@ +#if !defined(PARROT_RE_H_GUARD) +#define PARROT_RE_H_GUARD + +#include <parrot/parrot.h> +#include <parrot/string.h> +#include <parrot/stacks.h> + +/* +** I'm stashing notes on the regex implementation here. +** In Perl 5, the RE engine is built on a simple principle. A printout of the +** structure of the RE /fo*bar/ (obtained with -mre=debug) is below: +** +** 1: EXACT <f>(3) +** 3: STAR(6) +** 4: EXACT <o>(0) +** 6: EXACT <bar>(8) +** 8: END(0) +** +** The basic principle is that, if the operation matches, we jump to the number +** in the parenthesis. In this RE system, the exact opposite is true; we jump +** when we _fail_. +** +** +** RE: +** reFlags "" +** reMinlength 4 +** +** branch $start +** +** $advance: +** rePopindex +** reAdvance $fail +** $start: +** rePushindex +** reLiteral "f", $advance +** $findo: +** literal "o", $findbar +** rePushindex +** branch $findo +** $findbar: +** reLiteral "bar", $backtrack +** set I0, 1 #true +** reFinished +** $backtrack: +** rePopindex $advance +** branch $findbar +** $fail: +** set I0, 0 #false +** reFinished +** +** Although this looks like more code than the compact setup above, we're using +** normal opcodes, so we have to be explicit. There's no STAR op; the branch +** implicitly does that, and the rePushindex helps it out. Nothing is 'nested' +** within something else--backtracking has to be explicit, and the pushing/popping +** must also be explicit. Even the behavior of starting at the next spot in the +** string has to be explicitly laid out. THESE ARE ALL SIDE EFFECTS OF THE FACT +** WE'RE USING NORMAL OPCODES. Further, these opcodes have very little communication +** between them--a small structure (defined later on in this file) contains just the +** string we're matching against, the index we're at right now, some metadata about +** the RE, and a stack. The size of the program isn't because we're jumping on +** false instead of true. The only place that isn't a win is alternation. +** +** Okay, I'm done rambling. Back to the code... +*/ + +typedef struct re_info { + STRING * string; + INTVAL index; + INTVAL flags; + INTVAL minlength; + + /* stack stuff */ + struct StackChunk * stack_base; + struct Stack_Entry * stack_top; +} re_info; + +re_info * cur_re; + +#define RE_case_insensitive_FLAG 0x1 +#define RE_case_insensitive_TEST(info) info->flags & +RE_case_insensitive_FLAG +#define RE_case_insensitive_SET(info) info->flags |= +RE_case_insensitive_FLAG +#define RE_case_insensitive_CLEAR(info) info->flags &= +~RE_case_insensitive_FLAG + +#define RE_single_line_FLAG 0x2 +#define RE_single_line_TEST(info) info->flags & +RE_single_line_FLAG +#define RE_single_line_SET(info) info->flags |= +RE_single_line_FLAG +#define RE_single_line_CLEAR(info) info->flags &= +~RE_single_line_FLAG + +#define RE_multiline_FLAG 0x4 +#define RE_multiline_TEST(info) info->flags & +RE_multiline_FLAG +#define RE_multiline_SET(info) info->flags |= +RE_multiline_FLAG +#define RE_multiline_CLEAR(info) info->flags &= +~RE_multiline_FLAG + +#endif --- /dev/null Wed Dec 31 16:00:00 1969 +++ t\op\re.t Sat Nov 3 17:53:26 2001 @@ -0,0 +1,105 @@ +#!perl -w + +use Parrot::Test tests => 6; + +output_is(<<'CODE', "1", "A is A"); + reMatch RE, "A" + print I0 + end + +RE: + reLiteral "A", $fail + set I0, 1 + reFinished +$fail: + set I0, 0 + reFinished + +CODE + +output_is(<<'CODE', 0, "A is not B"); + reMatch RE, "A" + print I0 + end + +RE: + reLiteral "B", $fail + set I0, 1 + branch $end +$fail: + set I0, 0 +$end: + reFinished +CODE + +output_is(<<'CODE', 1, "advance-on-start works okay"); + reMatch RE, "bab" + print I0 + end + +RE: + branch $start +$advance: + rePopindex + reAdvance $fail +$start: + rePushindex + reLiteral "a", $advance + set I0, 1 + reFinished +$fail: + set I0, 0 + reFinished +CODE + +output_is(<<'CODE', 0, "advance-on-start fails okay"); + reMatch RE, "bxb" + print I0 + end + +RE: + branch $start +$advance: + rePopindex + reAdvance $fail +$start: + rePushindex + reLiteral "a", $advance + set I0, 1 + reFinished +$fail: + set I0, 0 + reFinished +CODE + +output_is(<<'CODE', "10", "dot works okay"); + reMatch RE, "a" + print I0 + reMatch RE, "\n" + print I0 + end + +RE: + reAnything $fail + set I0, 1 + reFinished +$fail: + set I0, 0 + reFinished +CODE + +output_is(<<'CODE', "10", "character class works okay"); + reMatch RE, "a" + print I0 + reMatch RE, "z" + print I0 + end + +RE: + reOneof "abc", $fail + set I0, 1 + reFinished +$fail: + set I0, 0 + reFinished +CODE