Okay, here it is. Attached is the regular expression patch. It currently segfaults on Windows because of a combination of two factors:
1) There are some bounds-checking issues in key.c 2) Windows's malloc() isn't as robust as Unix's This is only a problem on native Windows, not on Cygwin; I've confirmed this myself. Besides what you'd expect it to do, this patch makes a (very) minor change to string.c. Basically, it makes it so that you can take a zero-length substring with an index equal to the size of the string. This is done so that the equivalent to "<$`><$&><$'>" doesn't need a special case in the bytecode when $& reaches all the way to the end of the string. Copious documentation is included in rx.ops, and twenty tests are included in t/op/rx.t. This patch is reliant on the ParrotPointer patch I sent in earlier. I can't quite guarantee that the patch will apply cleanly--I had to manually change some things in it--but the things that may not apply well should be easy to put in manually. UNIMPLEMENTED OPCODES: rx_compile - compile a regex rx_cloneinfo - clone the info structure (used for look(ahead|behind)s) UNTESTED OPCODES: rx_forwards - tell the regex to increment the current index when moving rx_backwards - tell the regex to decrement the current index when moving Share and enjoy. --Brent Dax [EMAIL PROTECTED] Configure pumpking for Perl 6 <obra> mmmm. hawt sysadmin chx0rs <lathos> This is sad. I know of *a* hawt sysamin chx0r. <obra> I know more than a few. <lathos> obra: There are two? Are you sure it's not the same one?
--- parrot-cvs/Makefile.in Wed Jan 9 02:51:00 2002 +++ parrot/Makefile.in Wed Jan 9 02:50:24 2002 @@ -63,7 +63,7 @@ $(INC)/global_setup.h $(INC)/vtable.h $(INC)/oplib/core_ops.h $(INC)/oplib/core_ops_prederef.h \ $(INC)/runops_cores.h $(INC)/trace.h \ $(INC)/pmc.h $(INC)/key.h $(INC)/resources.h $(INC)/platform.h \ -$(INC)/interp_guts.h ${jit_h} ${jit_struct_h} +$(INC)/interp_guts.h ${jit_h} ${jit_struct_h} $(INC)/rx.h CLASS_O_FILES = classes/default$(O) classes/perlint$(O) classes/perlstring$(O) \ classes/perlnum$(O) classes/perlarray$(O) classes/perlundef$(O) \ @@ -79,7 +79,7 @@ INTERP_O_FILES = global_setup$(O) interpreter$(O) parrot$(O) register$(O) \ core_ops$(O) core_ops_prederef$(O) memory$(O) packfile$(O) stacks$(O) \ string$(O) encoding$(O) chartype$(O) runops_cores$(O) trace$(O) pmc$(O) key$(O) \ -platform$(O) ${jit_o} resources$(O) +platform$(O) ${jit_o} resources$(O) rx$(O) O_FILES = $(INTERP_O_FILES) $(IO_O_FILES) $(CLASS_O_FILES) $(ENCODING_O_FILES) $(CHARTYPE_O_FILES) @@ -292,6 +292,8 @@ register$(O): $(H_FILES) +rx$(O): $(H_FILES) + stacks$(O): $(H_FILES) core_ops$(O): $(H_FILES) core_ops.c @@ -396,4 +398,3 @@ lint: ${test_prog} $(LINT) ${cc_inc} -Iclasses $(LINTFLAGS) `echo $(O_FILES) | sed 's/\.o/\.c/g'` $(LINT) ${cc_inc} $(LINTFLAGS) test_main.c - --- parrot-cvs/MANIFEST Wed Jan 9 02:25:54 2002 +++ parrot/MANIFEST Tue Jan 8 16:32:58 2002 @@ -109,6 +110,7 @@ include/parrot/pmc.h include/parrot/register.h include/parrot/resources.h +include/parrot/rx.h include/parrot/runops_cores.h include/parrot/stacks.h include/parrot/string.h @@ -185,6 +187,8 @@ pmc_pm.pl register.c resources.c +rx.c +rx.ops runops_cores.c stacks.c string.c @@ -198,8 +202,8 @@ t/op/macro.t t/op/number.t t/op/pmc.t t/op/pmc_perlhash.t t/op/pmc_perlstring.t +t/op/rx.t t/op/stacks.t t/op/string.t t/op/time.t --- /dev/null Wed Jan 9 03:01:05 2002 +++ /parrot/include/parrot/rx.h Tue Jan 8 15:20:58 2002 @@ -0,0 +1,92 @@ +/* rx.h + * Copyright: (When this is determined...it will go here) + * CVS Info + * $Id$ + * Overview: + * Supporting file for the regular expression engine + * Data Structure and Algorithms: + * rxinfo is the main structure involved in regular expressions; it's stuffed + * into a Handle PMC and passed to all regular expression opcodes. + * History: + * Notes: + * References: + */ + +#if !defined(PARROT_RX_H_GUARD) +#define PARROT_RX_H_GUARD + +#include "parrot/parrot.h" + +typedef enum rxflags { + enum_rxflags_none=0, + enum_rxflags_case_insensitive=1, + enum_rxflags_single_line=2, + enum_rxflags_multiline=4, + enum_rxflags_reverse=8 +} rxflags; + +typedef enum rxdirection { + enum_rxdirection_forwards=1, + enum_rxdirection_backwards=-1 +} rxdirection; + +extern const INTVAL RX_MARK; + +typedef struct rxinfo { + STRING *string; + INTVAL index; + INTVAL startindex; + BOOLVAL success; + + rxflags flags; + INTVAL minlength; + rxdirection whichway; + + PMC *groupstart; + PMC *groupend; + + opcode_t *substfunc; + + struct Stack_Entry *stack_top; + struct StackChunk *stack_base; +} rxinfo; + +rxinfo * rx_allocate_info(struct Parrot_Interp *, STRING *); + +BOOLVAL rx_is_word_character(char ch); +BOOLVAL rx_is_number_character(char ch); +BOOLVAL rx_is_whitespace_character(char ch); + +STRING *rxP_get_substr(struct Parrot_Interp *, STRING *, INTVAL, INTVAL); + +#define RX_dUNPACK(pmc) rxinfo *rx=(rxinfo *)pmc->data +/* this one is really quite evil */ +#define RxCurChar(rx) ((char +*)rx->string->bufstart)[rx->index] +#define RxCurCharS(rx) rxP_get_substr(interpreter, +rx->string, rx->index, 1) + +#define RxAdvance(rx) RxAdvanceX(rx, 1) +#define RxAdvanceX(rx, x) rx->index += x * rx->whichway + +#define RxCaseInsensitive_on(rx) RxFlagOn(rx, enum_rxflags_case_insensitive) +#define RxCaseInsensitive_off(rx) RxFlagOff(rx, enum_rxflags_case_insensitive) +#define RxCaseInsensitive_test(rx) RxFlagTest(rx, enum_rxflags_case_insensitive) + +#define RxSingleLine_on(rx) RxFlagOn(rx, enum_rxflags_single_line) +#define RxSingleLine_off(rx) RxFlagOff(rx, enum_rxflags_single_line) +#define RxSingleLine_test(rx) RxFlagTest(rx, enum_rxflags_single_line) + +#define RxMultiline_on(rx) RxFlagOn(rx, enum_rxflags_multiline) +#define RxMultiline_off(rx) RxFlagOff(rx, enum_rxflags_multiline) +#define RxMultiline_test(rx) RxFlagTest(rx, enum_rxflags_multiline) + +#define RxReverse_on(rx) RxFlagOn(rx, enum_rxflags_reverse) +#define RxReverse_off(rx) RxFlagOff(rx, enum_rxflags_reverse) +#define RxReverse_test(rx) RxFlagTest(rx, enum_rxflags_reverse) + +#define RxFlagOn(rx, flag) (rx->flags |= flag) +#define RxFlagOff(rx, flag) (rx->flags &= ~flag) +#define RxFlagTest(rx, flag) (rx->flags & flag) + +#define RxFlagsOff(rx) rx->flags = enum_rxflags_none + +#endif \ No newline at end of file --- /dev/null Wed Jan 9 03:01:06 2002 +++ /parrot/rx.ops Tue Jan 8 16:16:54 2002 @@ -0,0 +1,1269 @@ +/* +** rx.ops +*/ + +#include "parrot/rx.h" + +#define RxAssertMore(rx, branchto) if(rx->index >= string_length(rx->string)) { goto +OFFSET(branchto); } + +VERSION = PARROT_VERSION; + +=head1 NAME + +rx.ops - Parrot Regular Expression Engine, version 3.0 + +=head1 SYNOPSIS + + # NOTE: This looks a LOT scarier than it really is + # "zzabbBBBBBBcdcdcdzz" =~ /ab*[cd]+/i + rx_allocateinfo P0, "zzabbBBBBBBcdcdcdzz" + bsr RX_0 + rx_info_successful P0, I0 + rx_freeinfo P0 + if I0, $match + print "no " + $match: + print "match" + end + + RX_0: + rx_setprops P0, "i", 2 + branch $start0 + $advance: + rx_advance P0, $fail + $start0: + rx_literal P0, "a", $advance + + $start1: + rx_pushmark P0 + $top1: + rx_literal P0, "b", $start2 + rx_pushindex P0 + branch $top1 + $back1: + rx_popindex P0, $advance + + $start2: + rx_literal P0, "cd", $back1 + $top2: + rx_oneof P0, "cd", $succeed + branch $top2 + + $succeed: + rx_succeed P0 + ret + $fail: + rx_fail P0 + ret + + +=head1 DESCRIPTION + +The Perl 5 regular expression engine was state-of-the-art. It was the fastest and +most featureful implementation available. Everybody used Perl 5's regular expression +syntax wherever possible. + +The Perl 5 regular expression engine was also a mess. + +The engine was like a separate interpreter unto itself. Few understood its dark +magic, +and fewer worked on its baroque source. It was a black box, sealed off from the +outside +world with only a couple opcodes to show in other files. It was the slowest part of +Perl +to adapt to new features--it was one of the last to get threadsafety and full Unicode +support--because so few people understood it. Larry Wall once said that three people +understood the regex engine, give or take four. + +Because of these issues, the design documents for Parrot called for regular +expression +opcodes to be built in to the interpreter. This group of opcodes, called the Parrot +Regular Expression Engine version 3.0 (or simply Rx3), is the result. + +=head2 Basic Concepts + +Perl 5 had one opcode for each operation in the regular expression. For example: + + >perl -mre=debug -e '/ab+[cd]/' + Compiling REx `ab+[cd]' + size 15 first at 1 + 1: EXACT <a>(3) + 3: PLUS(6) + 4: EXACT <b>(0) + 6: ANYOF[cd](15) + 15: END(0) + anchored `ab' at 0 floating `b' at 1..2147483647 (checking anchored) minlen 3 + Freeing REx: `ab+[cd]' + +(The C<re> pragma with the 'debug' switch displays the compiled version of the regex. +The numbers in parenthesis represent where to jump to on success; 0 is a +special value meaning "this part of the regex is done".) + +In Rx3, that regular expression would be something like: + + $advance: + rx_advance P0, $fail + $start: + rx_literal P0, "ab", $advance + rx_pushmark P0 + $top: + rx_pushindex P0 + rx_literal P0, "b", $next + branch $top + $backtrack: + rx_popindex P0, $advance + $next: + rx_oneof P0, "cd", $backtrack + branch $success + +(In Rx3, the last parameter is a label to branch to on I<failure>, not success.) + +If you were insane enough to convert the labels to offsets, you'd get something like: + + rx_advance P0, $fail + $start: + rx_literal P0, "ab", -6 + rx_pushmark P0 + rx_pushindex P0 + rx_literal P0, "b", 6 + branch -7 + rx_popindex P0, -19 + rx_oneof P0, "cd", -6 + branch $success + +9 operations in Rx3 to 5 in Perl 5. I can already hear the cynicism: "how could +that be BETTER?!?" Well, there's several reasons. + +The first is that it frees us to use normal ops, and in fact they're used all the +time. C<branch> is a normal op; so is C<bsr>, the normal way to call a regular +expression. Things like C<(?{CODE})> can be implemented with relative ease--simply +put the normal opcodes in the appropriate place in the regex. If you're debugging +a regex, you can simply sprinkle output messages liberally throughout the regex. + +The second is opcode dispatch. Parrot has very fast opcode dispatch, and we can use +that to our advantage. + +Finally, there's the matter of optimizations. As an example, take C</a+bc+/>. The +most efficient way to look for that is probably to look for the constant string 'abc' +and expand outwards from there--especially if you use Boyer-Moore or another fast +search algorithm. It means that the code generator can decide whether to optimize +for success or failure, for compilation or execution speed. You get the idea. + +Bottom line is, Rx3 lays out exactly what's going on. This is a feature. It gives +the +regex compiler total control over what's going on. + +=head2 The Opcodes + +There are two basic rules to how the opcodes operate. + +The first involves the PMC that most take as their first parameter. This is a handle +for +an 'info' structure. The info structure accumulates data as it churns through the +regex, +such as the start and current indices and the start and end of each group. It also +keeps +track of things like the string we're matching against. + +The second rule pertains to the ops that have an integer constant as their last +parameter. +For the most part, these ops will branch to that parameter if the 'fail'. For most +ops, +'fail' means 'fail to match'. + +If the documentation for an op doesn't specifically mention the first or last +parameter, +that's what they are. + +The documentation for each opcode follows. + +=cut + +############################################################################### + +=head3 Preparation + +=over 4 + +=cut + +######################################## + +=item C<rx_allocateinfo>(p, p|s|sc) + +Allocates a new info structure and puts it into the first parameter. The second +parameter +is the string to match against. + +=cut + +op rx_allocinfo(out pmc, in str) { + rxinfo *rx=rx_allocate_info(interpreter, $2); + + $1=pmc_new(interpreter, enum_class_ParrotPointer); + + $1->data=(void*)rx; + + goto NEXT(); +} + +op rx_allocinfo(out pmc, in pmc) { + rxinfo *rx=rx_allocate_info(interpreter, $2->vtable->get_string(interpreter, +$2)); + + $1=pmc_new(interpreter, enum_class_ParrotPointer); + + $1->data=(void*)rx; + + goto NEXT(); +} + +######################################## + +=item C<rx_freeinfo>(p) + +Deallocates the info structure in the first parameter and nulls out the handle. + +=cut + +op rx_freeinfo(inout pmc) { + mem_sys_free($1->data); + $1->data=NULL; + + goto NEXT(); +} + +######################################## + +=item C<rx_cloneinfo>(p) + +Clones the info structure in the first parameter. Make sure to save the original +structure in another register, the stack, or a symbol table entry before calling this +opcode. + +B<XXX> Currently this op has not been implemented. + +=cut + +op rx_cloneinfo(inout pmc) { + RX_dUNPACK($1); + + goto NEXT(); +} + +######################################## + +=item C<rx_compile>(i, s|sc, s|sc) + +Provides a built-in regular expression compiler. The first parameter is set to the +address of the newly-compiled regex, which can then be C<jsr>'ed to; the second +parameter is the regex itself; and the third parameter is the modifiers on the regex. + +B<XXX> Currently this op has not been implemented. + +=cut + +op rx_compile(out str, in str, in str) { + $1=0; + + goto NEXT(); +} + +############################################################################### + +=back + +=head3 Info accessor ops + +=over 4 + +=cut + +######################################## + +=item C<rx_info_successful>(p, i) + +If the info structure indicates the match was successful, sets the second parameter +to true; otherwise sets it to false. + +=cut + +op rx_info_successful(in pmc, out int) { + RX_dUNPACK($1); + + $2=rx->success; + + goto NEXT(); +} + +######################################## + +=item C<rx_info_getindex>(p, i) + +Retrieves the current index stored in the info structure. If the match has already +finished successfully, this will be the index of the end of the match. + +=cut + +op rx_info_getindex(in pmc, out int) { + RX_dUNPACK($1); + + $2=rx->index; + + goto NEXT(); +} + +######################################## + +=item C<rx_info_getstartindex>(p, i) + +Gets the index the match started at. + +Note that if a regex uses the C<rx_backwards(p)> op, the start and end indices may be +reversed. + +=cut + +op rx_info_getstartindex(in pmc, out int) { + RX_dUNPACK($1); + + $2=rx->startindex; + + goto NEXT(); +} + +######################################## + +=item C<rx_info_getgroup>(in pmc, out int, out int, in int) + +Gets the start and end indices of the group indicated by the fourth parameter. + +=cut + +op rx_info_getgroup(in pmc, out int, out int, in int) { + RX_dUNPACK($1); + + $2=rx->groupstart->vtable->get_integer_index(interpreter, rx->groupstart, $4); + $3=rx->groupend->vtable->get_integer_index(interpreter, rx->groupend, $4); + + goto NEXT(); +} + +############################################################################### + +=back + +=head3 Stack manipulation ops + +=over 4 + +=cut + +######################################## + +=item C<rx_pushindex>(p) + +Pushes the current index onto the stack contained in the info structure. + +=cut + +op rx_pushindex(in pmc) { + RX_dUNPACK($1); + + push_generic_entry(interpreter, &rx->stack_top, &rx->index, STACK_ENTRY_INT, +NULL); + + goto NEXT(); +} + +######################################## + +=item C<rx_pushmark>(p) + +Pushes a 'mark' onto the stack contained in the info structure. Marks are used +to indicate where one operation's backtrack information ends and another's begins. + +=cut + +op rx_pushmark(in pmc) { + RX_dUNPACK($1); + + /* Don't worry about the const warning from the next line */ + push_generic_entry(interpreter, &rx->stack_top, &RX_MARK, STACK_ENTRY_INT, +NULL); + + goto NEXT(); +} + +######################################## + +=item C<rx_popindex>(p, ic) + +Pops an index off the stack. If it pops a mark off instead, it branches to the +second parameter. + +=cut + +op rx_popindex(in pmc, in int) { + RX_dUNPACK($1); + int i; + + pop_generic_entry(interpreter, &rx->stack_top, &i, STACK_ENTRY_INT); + + if(i==RX_MARK) { + goto OFFSET($2); + } + else { + rx->index=i; + goto NEXT(); + } +} + +############################################################################### + +=back + +=head3 Directional ops + +=over 4 + +=cut + +######################################## + +=item C<rx_forwards>(p) + +Indicates that the regex should increment the index as it moves through the string. + +=cut + +op rx_forwards(in pmc) { + RX_dUNPACK($1); + + rx->whichway=enum_rxdirection_forwards; + + goto NEXT(); +} + + +######################################## + +=item C<rx_backwards>(p) + +Indicates that the regex should decrement the index as it moves through the string. +This is different from reversed regexes (see L</"rx_setprops(p, sc, ic)">); reversed +affects the start index, while backwards affects the end index. + +=cut + +op rx_backwards(in pmc) { + RX_dUNPACK($1); + + rx->whichway=enum_rxdirection_backwards; + + goto NEXT(); +} + +############################################################################### + +=back + +=head3 Matching ops + +=over 4 + +=cut + +######################################## + +=item C<rx_advance>(p, ic) + +Increments (or decrements, if the C<r> modifier is used) the start index one +character. Branches to the second parameter if it goes past the end of the string. + +=cut + +op rx_advance(in pmc, in int) { + RX_dUNPACK($1); + + if(!RxReverse_test(rx)) { + if(++rx->startindex + rx->minlength > string_length(rx->string)) { + goto OFFSET($2); + } + } + else { + if(--rx->startindex < 0) { + goto OFFSET($2); + } + } + + rx->index=rx->startindex; + + while(stack_depth(interpreter, rx->stack_base)) { + pop_generic_entry(interpreter, &rx->stack_top, NULL, STACK_ENTRY_INT); + } + + goto NEXT(); +} + +######################################## + +=item C<rx_incrindex>(p, ic) + +Increments the current index (or decrements, if C<rx_backwards> is used) by the +amount in the second parameter. Does I<not> check if it's gone past the end of the +string. + +=cut + +op rx_incrindex(in pmc, in int) { + RX_dUNPACK($1); + RxAdvanceX(rx, $2); + + goto NEXT(); +} + +######################################## + +=item C<rx_setprops>(p, sc, ic) + +Sets certain properties in the info structure. The second parameter is a string +containing one or more of the following characters: + +=over 4 + +=item C<i> + +Sets case-insensitive matching. + +=item C<s> + +Sets single-line matching; the C<rx_dot> op will match newlines with this turned on. + +=item C<m> + +Sets multiline matching; the C<rx_zwa_atbeginning> and C<rx_zwa_atend> opcodes will +match the beginning and end of lines. + +=item C<r> + +Sets reverse or right matching; match starts at the end of the string and inches +towards the beginning. + +=back + +The third parameter is the minimum length the string would need to be for a match to +be possible. For example, in the match C</ba*r+/>, the minimum length is 2. + +=cut + +op rx_setprops(in pmc, in str, in int) { + int i; + RX_dUNPACK($1); + + rx->minlength=$3; + + for(i=0; i < string_length($2); i++) { + switch(((char *)$2->bufstart)[i]) { + case 'i': + RxCaseInsensitive_on(rx); + break; + case 's': + RxSingleLine_on(rx); + break; + case 'm': + RxMultiline_on(rx); + break; + case 'r': + RxReverse_on(rx); + rx->index=rx->startindex=string_length(rx->string); + break; + default: + fprintf(stderr, "Unknown regular expression option +'%c'.", ((char*)$2->bufstart)[i]); + HALT(); + } + } + + goto NEXT(); +} + +######################################## + +=item C<rx_startgroup>(p, ic) + +Indicates that the current index is the start index of the group number indicated in +the second parameter. + +=cut + +op rx_startgroup(in pmc, in int) { + RX_dUNPACK($1); + + rx->groupstart->vtable->set_integer_index(interpreter, rx->groupstart, +rx->index, $2); + + goto NEXT(); +} + +######################################## + +=item C<rx_endgroup>(p, ic) + +Indicates that the current index is the end index of the group number indicated in +the second parameter. + +=cut + +op rx_endgroup(in pmc, in int) { + RX_dUNPACK($1); + + rx->groupend->vtable->set_integer_index(interpreter, rx->groupend, rx->index, +$2); + + goto NEXT(); +} + +######################################## + +=item C<rx_literal>(p, in str, ic) + +Matches the exact string (sensitive to the C<i> modifier) passed in the second +parameter. + +B<XXX> Currently does not honor the C<i> modifier. + +=cut + +op rx_literal(in pmc, in str, in int) { + RX_dUNPACK($1); + STRING *targ; + + if(string_length(rx->string) < rx->index+string_length($2)) { + goto OFFSET($3); + } + + targ=rxP_get_substr(interpreter, rx->string, rx->index, string_length($2)); + + if(string_compare(interpreter, $2, targ)==0) { + RxAdvanceX(rx, string_length($2)); + } + else { + goto OFFSET($3); + } + + goto NEXT(); +} + +######################################## + +=item C<rx_is_w>(p, ic) + +Matches a word character (usually C<\w>). + +=cut + +op rx_is_w(in pmc, in int) { + RX_dUNPACK($1); + + RxAssertMore(rx, $2); + + if(rx_is_word_character(RxCurChar(rx))) { + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($2); + } +} + +######################################## + +=item C<rx_is_n>(p, ic) + +Matches a number character (usually C<\n>). + +=cut + +op rx_is_n(in pmc, in int) { + RX_dUNPACK($1); + + RxAssertMore(rx, $2); + + if(rx_is_number_character(RxCurChar(rx))) { + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($2); + } +} + +######################################## + +=item C<rx_is_s>(p, ic) + +Matches a whitespace character (usually C<\s>). + +=cut + +op rx_is_s(in pmc, in int) { + RX_dUNPACK($1); + + RxAssertMore(rx, $2); + + if(rx_is_whitespace_character(RxCurChar(rx))) { + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($2); + } + +} + +######################################## + +=item C<rx_oneof>(p, sc, ic) + +Matches if the current character is one of the characters in the second parameter. +Sensitive to the C<i> modifier. + +This op requires that its input be sorted for efficiency. Further, it requires that +all +ranges (C<a-z>) be expanded by the regex compiler. + +B<XXX> Currently does not honor the C<i> modifier. + +=cut + +op rx_oneof(in pmc, in str, in int) { + RX_dUNPACK($1); + STRING *ch1; + STRING *ch2; + INTVAL i; + + /* XXX In the future, this ought to use bitmaps. */ + + RxAssertMore(rx, $3); + + ch1=RxCurCharS(rx); + + if(string_length($2) < 8) { /* XXX run benchmarks to find a good value */ + /* modified linear search--slow, but zero overhead */ + for(i=0; i < string_length($2); i++) { + ch2=rxP_get_substr(interpreter, $2, i, 1); + + if(string_compare(interpreter, ch1, ch2)==0) { + RxAdvance(rx); + goto NEXT(); + } + else if(string_compare(interpreter, ch1, ch2) < 0) { + goto OFFSET($3); + } + } + } + else { + /* binary search--fast but complicated */ + INTVAL upper, lower=0, index=0, lastindex=-1, cmp; + + upper=string_length($2); + + while(upper > lower) { + index=(upper+lower)/2; + + if(index==lastindex) { + goto OFFSET($3); + } + else if(index==string_length($2)) { + goto OFFSET($3); + } + + cmp=string_compare(interpreter, RxCurCharS(rx), +rxP_get_substr(interpreter, $2, index, 1)); + + if(0==cmp) { + RxAdvance(rx); + goto NEXT(); + } + else if(0 > cmp) { + upper=index; + } + else { + lower=index; + } + + lastindex=index; + } + } + + goto OFFSET($3); +} + +######################################## + +=item C<rx_dot>(p, ic) + +Matches any character except a newline (C<\n>). (If the C<s> modifier is used, +matches any character at all.) + +=cut + +op rx_dot(in pmc, in int) { + RX_dUNPACK($1); + + RxAssertMore(rx, $2); + + if(RxSingleLine_test(rx)) { + RxAdvance(rx); + goto NEXT(); + } + else { + STRING *ch=RxCurCharS(rx); + STRING *nl=string_make(interpreter, "\n", 1, 0, 0, 0); + + if(string_compare(interpreter, ch, nl)!=0) { + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($2); + } + } +} + +######################################## + +=item C<rx_zwa_boundary>(p, ic) + +Matches if the one of the previous character and the next character is a word +character, and the other one is not (usually C<\b>). + +=cut + +op rx_zwa_boundary(in pmc, in int) { + RX_dUNPACK($1); + char ch1, ch2; + + ch1=RxCurChar(rx); + RxAdvanceX(rx, -1); + ch2=RxCurChar(rx); + RxAdvance(rx); + + if(rx_is_word_character(ch1) == rx_is_word_character(ch2)) { + goto OFFSET($2); + } + + goto NEXT(); +} + +######################################## + +=item C<rx_zwa_atbeginning>(p, ic) + +Matches at the beginning of the string. If the C<m> modifier is used, matches at the +beginning of any line. + +B<XXX> Currently does not honor the C<m> modifier. + +=cut + +op rx_zwa_atbeginning(in pmc, in int) { + RX_dUNPACK($1); + + if(rx->index != 0) { + goto OFFSET($2); + } + + goto NEXT(); +} + +######################################## + +=item C<rx_zwa_atend>(p, ic) + +Matches at the end of the string. If the C<m> modifier is used, matches at the +end of any line. + +B<XXX> Currently does not honor the C<m> modifier. + +=cut + +op rx_zwa_atend(in pmc, in int) { + RX_dUNPACK($1); + + if(rx->index != string_length(rx->string)) { + goto OFFSET($2); + } + + goto NEXT(); +} + +######################################## + +=item C<rx_succeed>(p) + +Modifies the info structure to indicate that the match succeeded. + +=cut + +op rx_succeed(in pmc) { + RX_dUNPACK($1); + + rx->success=1; + + goto NEXT(); +} + +######################################## + +=item C<rx_fail>(p) + +Modifies the info structure to indicate that the match failed. + +=cut + +op rx_fail(in pmc) { + RX_dUNPACK($1); + + rx->success=0; + + goto NEXT(); +} + +############################################################################### + +=back + +=head2 Using the opcodes + +=head3 Tutorial + +Now that you've seen all the opcodes available, you'll probably want to know how to +use +them. + +To do so, we'll walk you through the building of a regular expression. For this +example, +we'll use the expression C</ab*[cd]+/i>. (This is the same expression written out in +L</SYNOPSIS>.) + +The first step is to break it up into term-quantifier pairs. In this case: + + RX_0: + a + b* + [cd]+ + + +Next, we'll figure out how to match each term. + + RX_0: + rx_literal P0, "a", ... + rx_literal P0, "b", ... + rx_oneof P0, "cd", ... + +The elipses will be filled in later. + +Now, we need to figure out how to represent the quantifiers. We end up with +something +like this: + + RX_0: + rx_literal P0, "a", ... + + $top1: + rx_literal P0, "b", ... + branch $top1 + + rx_oneof P0, "cd", ... + $top2: + rx_oneof P0, "cd", ... + branch $top2 + +Note that C<[cd]+> is equivalent to C<[cd][cd]*>. We take advantage of this fact. + +Next, we add C<rx_pushmark> ops at the boundaries between quantifiers, and +C<rx_pushindex> ops within the quantifiers themselves. + + RX_0: + rx_literal P0, "a", ... + + rx_pushmark P0 + $top1: + rx_literal P0, "b", ... + rx_pushindex P0 + branch $top1 + + rx_pushmark P0 + $top2: + rx_oneof P0, "cd", ... + rx_pushindex P0 + branch $top2 + +Next, we add the backtracking code. Backtracking is usually done one way. + + RX_0: + rx_literal P0, "a", ... + $back0: + branch ... + + rx_pushmark P0 + $top1: + rx_literal P0, "b", ... + rx_pushindex P0 + branch $top1 + $back1: + rx_popindex P0, ... + + rx_pushmark P0 + $top2: + rx_oneof P0, "cd", ... + rx_pushindex P0 + branch $top2 + $back2: + rx_popindex P0, ... + +After that, we'll add some skeleton code. + + RX_0: + rx_setprops P0, "i", 2 + branch $start0 + $advance: + rx_advance P0, ... + + $start0: + rx_literal P0, "a", ... + $back0: + branch ... + #if there's no quantifier, you just fall back to the + # previous backtrack + + rx_pushmark P0 + $top1: + rx_literal P0, "b", ... + rx_pushindex P0 + branch $top1 + $back1: + rx_popindex P0, ... + + rx_pushmark P0 + rx_oneof P0, "cd", ... + $top2: + rx_oneof P0, "cd", ... + rx_pushindex P0 + branch $top2 + $back2: + rx_popindex P0, ... + + rx_succeed P0 + ret + $fail: + rx_fail P0 + ret + +Now that that's done, we'll connect the dots (literally). + + RX_0: + rx_setprops P0, "i", 2 + branch $start0 + $advance: + rx_advance P0, $fail + $start0: + rx_literal P0, "a", $advance + $back0: + branch $advance + + $start1: + rx_pushmark P0 + $top1: + #when you're looping in a quantifier and a match operation fails, + # you should simply move on to the next step of the match. + rx_literal P0, "b", $start2 + rx_pushindex P0 + branch $top1 + $back1: + rx_popindex P0, $back0 + + $start2: + rx_literal P0, "cd", $back1 + rx_pushmark P0 + $top2: + rx_oneof P0, "cd", $succeed + rx_pushindex P0 + branch $top2 + $back2: + rx_popindex P0, $back1 + + $succeed: + rx_succeed P0 + ret + $fail: + rx_fail P0 + ret + +Notice how C<$advance> serves as a sort of C<$back-1>--it serves as the default +"backtracker". Note also how only the C<rx_advance> uses the C<$fail> label--that's +how +it backtracks. + +The final step is optimization. In this case, there are two things we can optimize. +First of all, that C<$back0> is pointless--we're better off just deleting it and +branching +to C<$advance> directly. Second, C<$back2> will I<never> be called, so we can get +rid of +it too. + + RX_0: + rx_setprops P0, "i", 2 + branch $start0 + $advance: + rx_advance P0, $fail + $start0: + rx_literal P0, "a", $advance + + $start1: + rx_pushmark P0 + $top1: + rx_literal P0, "b", $start2 + rx_pushindex P0 + branch $top1 + $back1: + rx_popindex P0, $advance + + $start2: + rx_literal P0, "cd", $back1 + $top2: + rx_oneof P0, "cd", $succeed + branch $top2 + + $succeed: + rx_succeed P0 + ret + $fail: + rx_fail P0 + ret + +We've now written the regular expression itself; the one thing left to do is write +the +code that calls it. Let's say the Perl code looks like this: + + unless("zzabbBBBBBBcdcdcdzz" =~ /ab*[cd]+/i) { + print "no "; + } + + print "match"; + +Then the Parrot code would be something like this: + + rx_allocateinfo P0, "zzabbBBBBBBcdcdcdzz" + bsr RX_0 + rx_info_successful P0, I0 + rx_freeinfo P0 + if I0, $match + print "no " + $match: + print "match" + end + +Congratulations--you've now written your first regular expression with Rx3. That +wasn't +so hard, now was it? + +=head3 Common constructs + +The list below gives simple templates for common quantifiers operations. + +=over 4 + +=item C<x*> + + $start: + rx_pushmark P0 + $loop: + rx_pushindex P0 + rx_literal P0, "x", $next + branch $loop + $back: + rx_popindex P0, $lastback + branch $next + +=item C<x+> + + $start: + rx_literal P0, "x", $lastback + rx_pushmark P0 + $loop: + rx_pushindex P0 + rx_literal P0, $next + branch $loop + $back: + rx_popindex P0, $lastback + branch $next + +=item C<x?> + + $start: + rx_pushmark P0 + rx_literal P0, "x", $next + rx_pushindex P0 + branch $next + $back: + rx_popindex P0, $lastback + branch $next + +=item C<x*?> + + $start: + branch $next + $back: + rx_literal P0, "x", $lastback + branch $next + +=item C<x+?> + + $start: + rx_literal P0, "x", $lastback + branch $next + $back: + rx_literal P0, "x", $lastback + branch $next + +=item C<x??> + + $start: + set I0, 0 #I0 used to make sure we haven't backtracked before + branch $next + $back: + if I0, $lastback + rx_literal P0, "x", $lastback + branch $next + +=item C<x|y> + + $start: + rx_pushmark P0 + rx_pushindex P0 + rx_literal P0, "x", $nextalt + branch $next + $nextalt: + rx_popindex P0 + rx_literal P0, "x", $back #no, that's not a typo + branch $next + $back: + rx_popmark P0 + branch $lastback + +=item C<(?=x)> + + set P1, P0 + rx_cloneinfo P0 + + rx_literal P0, "x", $lastback + + set P0, P1 + +=back + +B<XXX> Finish this documentation. + +=head1 BUGS + +=over 4 + +=item * + +This code currently requires everything to be in an eight-bit encoding compatible +with ASCII. + +=item * + +Many modifiers are not currently respected. + +=item * + +There are undoubtably many more in code this complicated. + +=back + +=head1 AUTHORS + +Copyright (C) 2001-2002 The Parrot Team <[EMAIL PROTECTED]>. + +Initial version by Brent Dax <[EMAIL PROTECTED]>; special thanks to Angel +Faus <[EMAIL PROTECTED]> and Jeff 'japhy' Pinyan <[EMAIL PROTECTED]> for major help, +especially with decisions on the architecture of the engine. + +=cut \ No newline at end of file --- /dev/null Wed Jan 9 03:01:06 2002 +++ /parrot/t/op/rx.t Mon Jan 7 22:31:14 2002 @@ -0,0 +1,203 @@ +use Parrot::Test tests => 20; + +sub gentest($$;$$) { + $_[2] ||= ""; + $_[3] ||= 0; + + return <<"END"; + set S0, "$_[0]" + rx_allocinfo P0, S0 + bsr RX_0 + rx_info_successful P0, I0 + if I0, \$yup + print "no match\\n" + end + \$yup: + rx_info_getstartindex P0, I1 + rx_info_getindex P0, I2 + length I3, S0 + + rx_freeinfo P0 + + substr S1, S0, 0, I1 + sub I4, I2, I1 + substr S2, S0, I1, I4 + sub I4, I3, I2 + substr S3, S0, I2, I4 + + print "<" + print S1 + print "><" + print S2 + print "><" + print S3 + print ">\\n" + + end + + RX_0: + rx_setprops P0, "$_[2]", $_[3] + branch \$start + \$advance: + rx_advance P0, \$fail + \$start: + $_[1] + + rx_succeed P0 + ret + \$fail: + rx_fail P0 + ret +END +} + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'A is A'); + rx_literal P0, "a", $advance +CODE +<><a><> +OUTPUT + +output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'A is not B'); + rx_literal P0, "a", $advance +CODE +no match +OUTPUT + +output_is(gentest('ba', <<'CODE'), <<'OUTPUT', 'inching through the string'); + rx_literal P0, "a", $advance +CODE +<b><a><> +OUTPUT + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'character classes (successful)'); + rx_oneof P0, "aeiou", $advance +CODE +<><a><> +OUTPUT + +output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'character classes (failure)'); + rx_oneof P0, "aeiou", $advance +CODE +no match +OUTPUT + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'dot (success)'); + rx_dot P0, $advance +CODE +<><a><> +OUTPUT + +output_is(gentest('\n', <<'CODE'), <<'OUTPUT', 'dot (failure)'); + rx_dot P0, $advance +CODE +no match +OUTPUT + +output_is(gentest('aA9_', <<'CODE'), <<'OUTPUT', '\w (success)'); + rx_is_w P0, $advance + rx_is_w P0, $advance + rx_is_w P0, $advance + rx_is_w P0, $advance +CODE +<><aA9_><> +OUTPUT + +output_is(gentest('?', <<'CODE'), <<'OUTPUT', '\w (failure)'); + rx_is_w P0, $advance +CODE +no match +OUTPUT + +output_is(gentest('ba', <<'CODE', 'r'), <<'OUTPUT', 'reversed regexen (/r)'); + rx_dot P0, $advance +CODE +<b><a><> +OUTPUT + +output_is(gentest('\n', <<'CODE', 's'), <<'OUTPUT', 'single-line regexen (/s)'); + rx_dot P0, $advance +CODE +<>< +><> +OUTPUT + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'stack (pushindex/popindex)'); + rx_pushindex P0 + rx_literal P0, "a", $advance + rx_popindex P0, $advance +CODE +<><><a> +OUTPUT + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'stack (pushmark)'); + rx_pushmark P0 + rx_pushindex P0 + rx_literal P0, "a", $advance + rx_popindex P0, $advance + rx_popindex P0, $advance +CODE +no match +OUTPUT + +TODO: { + local $TODO="pending key fixes" if $^O eq "MSWin32"; +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'groups'); + rx_startgroup P0, 0 + rx_literal P0, "a", $advance + rx_endgroup P0, 0 + + rx_info_getgroup P0, I1, I2, 0 + sub I2, I2, I1 + substr S1, S0, I1, I2 + print "(" + print S1 + print ")\n" +CODE +(a) +<><a><> +OUTPUT +} + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (success)'); + rx_zwa_atbeginning P0, $advance + rx_literal P0, "a", $advance +CODE +<><a><> +OUTPUT + +output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (failure)'); + rx_zwa_atbeginning P0, $advance + rx_literal P0, "a", $advance +CODE +no match +OUTPUT + +output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: $ (success)'); + rx_literal P0, "a", $advance + rx_zwa_atend P0, $advance +CODE +<><a><> +OUTPUT + +output_is(gentest('ab', <<'CODE'), <<'OUTPUT', 'ZWA: $ (failure)'); + rx_literal P0, "a", $advance + rx_zwa_atend P0, $advance +CODE +no match +OUTPUT + +output_is(gentest('a?', <<'CODE'), <<'OUTPUT', 'ZWA: \b (success)'); + rx_literal P0, "a", $advance + rx_zwa_boundary P0, $advance +CODE +<><a><?> +OUTPUT + +output_is(gentest('ab', <<'CODE'), <<'OUTPUT', 'ZWA: \b (failure)'); + rx_literal P0, "a", $advance + rx_zwa_boundary P0, $advance +CODE +no match +OUTPUT + +1; \ No newline at end of file --- /parrot-cvs/string.c Tue Jan 8 20:34:54 2002 +++ /parrot/string.c Tue Jan 8 22:07:40 2002 @@ -300,9 +300,15 @@ UINTVAL true_length; true_offset = (UINTVAL)offset; + + if (idx == string_length(src) && length < 1) { + return NULL; + } + if (offset < 0) { true_offset = (UINTVAL) (src->strlen + offset); } + if (true_offset > src->strlen-1) { /* 0 based... */ INTERNAL_EXCEPTION(SUBSTR_OUT_OF_STRING, "Cannot take substr outside string")