Pretty simple. Mucks with the internals of regexes to use bitmaps in rx_oneof, rx_is_w, rx_is_s, rx_dot, and rx_zwa_boundary. Also adds three new ops:
-rx_clearinfo: clears out an info structure (useful for speeding up more than one regex call). -rx_makebmp: prebuilds a bitmap. -rx_oneof_bmp: uses a prebuilt bitmap. This also recognizes that test 14 of rx.t no longer fails under Win32--it revokes its TODO status. This patch is NOT bytecode-compatible with the last version of regexes. --Brent Dax [EMAIL PROTECTED] Parrot Configure pumpking and regex hacker <obra> mmmm. hawt sysadmin chx0rs <lathos> This is sad. I know of *a* hawt sysamin chx0r. <obra> I know more than a few. <lathos> obra: There are two? Are you sure it's not the same one?
diff -ur ..\..\parrot-cvs\parrot/include/parrot/rx.h ../include/parrot/rx.h --- ..\..\parrot-cvs\parrot/include/parrot/rx.h Mon Jan 14 01:19:30 2002 +++ ./include/parrot/rx.h Mon Jan 14 01:23:22 2002 @@ -17,6 +17,8 @@ #include "parrot/parrot.h" +typedef char* Bitmap; + typedef enum rxflags { enum_rxflags_none=0, enum_rxflags_case_insensitive=1, @@ -31,6 +33,11 @@ } rxdirection; extern const INTVAL RX_MARK; +extern const char * RX_WORDCHARS; +extern const char * RX_NUMCHARS; +extern const char * RX_SPACECHARS; + +#define cstr2pstr(cstr) string_make(interpreter, cstr, strlen(cstr), 0, 0, 0) typedef struct rxinfo { STRING *string; @@ -52,9 +59,12 @@ rxinfo * rx_allocate_info(struct Parrot_Interp *, STRING *); -BOOLVAL rx_is_word_character(char ch); -BOOLVAL rx_is_number_character(char ch); -BOOLVAL rx_is_whitespace_character(char ch); +BOOLVAL rx_is_word_character(struct Parrot_Interp *, INTVAL ch); +BOOLVAL rx_is_number_character(struct Parrot_Interp *, INTVAL ch); +BOOLVAL rx_is_whitespace_character(struct Parrot_Interp *, INTVAL ch); + +Bitmap make_bitmap(STRING* str); +BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp); STRING *rxP_get_substr(struct Parrot_Interp *, STRING *, INTVAL, INTVAL); diff -ur ..\..\parrot-cvs\parrot/rx.c ./rx.c --- ..\..\parrot-cvs\parrot/rx.c Thu Jan 10 16:04:54 2002 +++ ./rx.c Mon Jan 14 01:21:46 2002 @@ -15,6 +15,9 @@ #include "parrot/rx.h" const INTVAL RX_MARK=-1; +const char * +RX_WORDCHARS="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; +const char * RX_NUMCHARS="0123456789"; +const char * RX_SPACECHARS="\r\n\t "; /************************************************************* * Parrot Regular Expression Engine, version 3.0 alpha (Rx3) * @@ -39,21 +42,19 @@ return rx; } -BOOLVAL rx_is_word_character(char ch) { - if( - (ch >= 'a' && ch <= 'z') || - (ch >= 'A' && ch <= 'Z') || - (ch >= '0' && ch <= '9') || - ch == '_' - ) { - return 1; - } - else { - return 0; +BOOLVAL rx_is_word_character(struct Parrot_Interp *interpreter, INTVAL ch) { + static Bitmap bmp=NULL; + + if(!bmp) { + bmp=make_bitmap(cstr2pstr(RX_WORDCHARS)); } + + return check_bitmap(ch, bmp); } -BOOLVAL rx_is_number_character(char ch) { +BOOLVAL rx_is_number_character(struct Parrot_Interp *interpreter, INTVAL ch) { + /* faster to do less-than/greater-than */ + if(ch >= '0' && ch <= '9') { return 1; } @@ -62,13 +63,14 @@ } } -BOOLVAL rx_is_whitespace_character(char ch) { - if(ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ') { - return 1; - } - else { - return 0; +BOOLVAL rx_is_whitespace_character(struct Parrot_Interp *interpreter, INTVAL ch) { + static Bitmap bmp=NULL; + + if(!bmp) { + bmp=make_bitmap(cstr2pstr(RX_SPACECHARS)); } + + return check_bitmap(ch, bmp); } STRING *rxP_get_substr(struct Parrot_Interp *interpreter, STRING * source, INTVAL startindex, INTVAL length) { @@ -82,3 +84,33 @@ return ret; } + +Bitmap make_bitmap(STRING* str) { + INTVAL i, ch; + Bitmap bmp=mem_sys_allocate(32); + + /* XXX Bitmaps currently do not support chars > 255 -- + the memory space needed grows _very_ fast */ + + for(i=0; i < 32; i++) { + bmp[i]=0; + } + + for(i=0; i < string_length(str); i++) { + ch=string_ord(str, i); + + if(ch > 255) return NULL; /* Can't do it--caller must figure out +another way */ + + bmp[ch>>3] |= (1<<(ch & 7)); + } + + return bmp; +} + +BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp) { + if(ch > 255) { + return 0; + } + + return bmp[ch>>3] & (1<<(ch & 7)); +} \ No newline at end of file diff -ur ..\..\parrot-cvs\parrot/rx.ops ./rx.ops --- ..\..\parrot-cvs\parrot/rx.ops Mon Jan 14 01:19:28 2002 +++ ./rx.ops Mon Jan 14 01:31:10 2002 @@ -207,6 +207,23 @@ goto NEXT(); } +op rx_clearinfo(inout pmc, in str) { + RX_dUNPACK($1); + rx->index=rx->startindex=0; + rx->flags=enum_rxflags_none; + rx->success=0; + rx->minlength=0; + rx->whichway=enum_rxdirection_forwards; + + while(stack_depth(interpreter, rx->stack)) { + stack_pop(interpreter, rx->stack, NULL, STACK_ENTRY_INT); + } + + rx->string=$2; + + goto NEXT(); +} + ######################################## =item C<rx_freeinfo>(inout pmc) @@ -230,13 +247,26 @@ structure in another register, the stack, or a symbol table entry before calling this opcode. -B<XXX> Currently this op has not been implemented. +This opcode actually creates a new stack for the new regular expression structure, but +all other fields (including the group-related ones) stay the same. It's primarily +used +for things like lookaheads and lookbehinds, where the regex's state should be +completely +restored to the original version if the match succeeds. (Well, almost +completely--groups +matched with a cloned structure live on in the original.) =cut op rx_cloneinfo(inout pmc) { + rxinfo *rx2; RX_dUNPACK($1); + + rx2=rx_allocate_info(interpreter, rx->string); + *rx2=*rx; + + rx2->stack=new_stack(interpreter); + $1=pmc_new(interpreter, enum_class_ParrotPointer); + $1->data=rx2; + goto NEXT(); } @@ -376,7 +406,6 @@ op rx_pushmark(in pmc) { RX_dUNPACK($1); - /* Don't worry about the const * to void * cast on the next line */ stack_push(interpreter, rx->stack, (void *)&RX_MARK, STACK_ENTRY_INT, NULL); goto NEXT(); @@ -651,8 +680,8 @@ RX_dUNPACK($1); RxAssertMore(rx, $2); - - if(rx_is_word_character(RxCurChar(rx))) { + + if(rx_is_word_character(interpreter, RxCurChar(rx))) { RxAdvance(rx); goto NEXT(); } @@ -661,6 +690,7 @@ } } + ######################################## =item C<rx_is_n>(in pmc, in int) @@ -674,7 +704,7 @@ RxAssertMore(rx, $2); - if(rx_is_number_character(RxCurChar(rx))) { + if(rx_is_number_character(interpreter, RxCurChar(rx))) { RxAdvance(rx); goto NEXT(); } @@ -696,7 +726,7 @@ RxAssertMore(rx, $2); - if(rx_is_whitespace_character(RxCurChar(rx))) { + if(rx_is_whitespace_character(interpreter, RxCurChar(rx))) { RxAdvance(rx); goto NEXT(); } @@ -722,18 +752,19 @@ op rx_oneof(in pmc, in str, in int) { RX_dUNPACK($1); - STRING *ch1; - STRING *ch2; - INTVAL i; - - /* XXX In the future, this ought to use bitmaps. */ - + Bitmap bitmap; + RxAssertMore(rx, $3); - ch1=RxCurCharS(rx); + bitmap=make_bitmap($2); - if(string_length($2) < 8) { /* XXX run benchmarks to find a good value */ - /* modified linear search--slow, but zero overhead */ + if(!bitmap) { + INTVAL i; + STRING *ch1; + STRING *ch2; + + ch1=RxCurCharS(rx); + for(i=0; i < string_length($2); i++) { ch2=rxP_get_substr(interpreter, $2, i, 1); @@ -745,41 +776,63 @@ goto OFFSET($3); } } + + goto OFFSET($3); } else { - /* binary search--fast but complicated */ - INTVAL upper, lower=0, index=0, lastindex=-1, cmp; - - upper=string_length($2); - - while(upper > lower) { - index=(upper+lower)/2; - - if(index==lastindex) { - goto OFFSET($3); - } - else if(index==string_length($2)) { - goto OFFSET($3); - } - - cmp=string_compare(interpreter, RxCurCharS(rx), rxP_get_substr(interpreter, $2, index, 1)); - - if(0==cmp) { - RxAdvance(rx); - goto NEXT(); - } - else if(0 > cmp) { - upper=index; - } - else { - lower=index; - } - - lastindex=index; + if(check_bitmap(RxCurChar(rx), bitmap)) { + mem_sys_free(bitmap); + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($3); } } +} + + +######################################## + +=item C<rx_oneof_bmp>(in pmc, in pmc, in int) + +This op has the exact same behavior as C<rx_oneof>, except that the second parameter +is a ParrotPointer to a bitmap generated by C<rx_makebmp>. + +=cut + +op rx_oneof_bmp(in pmc, in pmc, in int) { + RX_dUNPACK($1); + + RxAssertMore(rx, $3); - goto OFFSET($3); + if(check_bitmap(RxCurChar(rx), $2->data)) { + RxAdvance(rx); + goto NEXT(); + } + else { + goto OFFSET($3); + } +} + +####################################### + +=item C<rx_makebmp>(out pmc, in str) + +This op pre-generates bitmaps to be used with C<rx_oneof_bmp>, increasing performance. +The first parameter will be set to a ParrotPointer to the bitmap; the second parameter +is the string to be bitmapped. + +Note that bitmaps are currently NOT compatible with characters above 255 (as defined +by +whatever character set you're using). This may change in the future. + +=cut + +op rx_makebmp(out pmc, in str) { + $1=pmc_new(interpreter, enum_class_ParrotPointer); + $1->data=(void*)make_bitmap($2); + + goto NEXT(); } ######################################## @@ -792,6 +845,7 @@ =cut op rx_dot(in pmc, in int) { + static Bitmap bmp; RX_dUNPACK($1); RxAssertMore(rx, $2); @@ -801,10 +855,11 @@ goto NEXT(); } else { - STRING *ch=RxCurCharS(rx); - STRING *nl=string_make(interpreter, "\n", 1, 0, 0, 0); + if(!bmp) { + bmp=make_bitmap(cstr2pstr("\r\n")); + } - if(string_compare(interpreter, ch, nl)!=0) { + if(!check_bitmap(RxCurChar(rx), bmp)) { RxAdvance(rx); goto NEXT(); } @@ -825,14 +880,14 @@ op rx_zwa_boundary(in pmc, in int) { RX_dUNPACK($1); - char ch1, ch2; + BOOLVAL one, two; - ch1=RxCurChar(rx); + one=rx_is_word_character(interpreter, RxCurChar(rx)); RxAdvanceX(rx, -1); - ch2=RxCurChar(rx); + two=rx_is_word_character(interpreter, RxCurChar(rx)); RxAdvance(rx); - if(rx_is_word_character(ch1) == rx_is_word_character(ch2)) { + if(one && two || !one && !two) { goto OFFSET($2); } --- ..\..\parrot-cvs\parrot/t/op/rx.t Thu Jan 10 16:04:56 2002 +++ ./t/op/rx.t Mon Jan 14 01:35:16 2002 @@ -139,8 +139,6 @@ no match OUTPUT -TODO: { - local $TODO="pending key fixes" if $^O eq "MSWin32"; output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'groups'); rx_startgroup P0, 0 rx_literal P0, "a", $advance @@ -156,7 +154,6 @@ (a) <><a><> OUTPUT -} output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (success)'); rx_zwa_atbeginning P0, $advance