Pretty simple.  Mucks with the internals of regexes to use bitmaps in
rx_oneof, rx_is_w, rx_is_s, rx_dot, and rx_zwa_boundary.  Also adds
three new ops:

        -rx_clearinfo: clears out an info structure (useful for speeding up
more than one regex call).
        -rx_makebmp: prebuilds a bitmap.
        -rx_oneof_bmp: uses a prebuilt bitmap.

This also recognizes that test 14 of rx.t no longer fails under
Win32--it revokes its TODO status.

This patch is NOT bytecode-compatible with the last version of regexes.

--Brent Dax
[EMAIL PROTECTED]
Parrot Configure pumpking and regex hacker

<obra> mmmm. hawt sysadmin chx0rs
<lathos> This is sad. I know of *a* hawt sysamin chx0r.
<obra> I know more than a few.
<lathos> obra: There are two? Are you sure it's not the same one?
diff -ur ..\..\parrot-cvs\parrot/include/parrot/rx.h ../include/parrot/rx.h
--- ..\..\parrot-cvs\parrot/include/parrot/rx.h Mon Jan 14 01:19:30 2002
+++ ./include/parrot/rx.h       Mon Jan 14 01:23:22 2002
@@ -17,6 +17,8 @@
 
 #include "parrot/parrot.h"
 
+typedef char* Bitmap;
+
 typedef enum rxflags {
        enum_rxflags_none=0,
        enum_rxflags_case_insensitive=1,
@@ -31,6 +33,11 @@
 } rxdirection;
 
 extern const INTVAL RX_MARK;
+extern const char * RX_WORDCHARS;
+extern const char * RX_NUMCHARS;
+extern const char * RX_SPACECHARS;
+
+#define cstr2pstr(cstr) string_make(interpreter, cstr, strlen(cstr), 0, 0, 0)
 
 typedef struct rxinfo {
        STRING *string;
@@ -52,9 +59,12 @@
 
 rxinfo * rx_allocate_info(struct Parrot_Interp *, STRING *);
 
-BOOLVAL  rx_is_word_character(char ch);
-BOOLVAL  rx_is_number_character(char ch);
-BOOLVAL  rx_is_whitespace_character(char ch);
+BOOLVAL  rx_is_word_character(struct Parrot_Interp *, INTVAL ch);
+BOOLVAL  rx_is_number_character(struct Parrot_Interp *, INTVAL ch);
+BOOLVAL  rx_is_whitespace_character(struct Parrot_Interp *, INTVAL ch);
+
+Bitmap make_bitmap(STRING* str);
+BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp);
 
 STRING *rxP_get_substr(struct Parrot_Interp *, STRING *, INTVAL, INTVAL);
 
diff -ur ..\..\parrot-cvs\parrot/rx.c ./rx.c
--- ..\..\parrot-cvs\parrot/rx.c        Thu Jan 10 16:04:54 2002
+++ ./rx.c      Mon Jan 14 01:21:46 2002
@@ -15,6 +15,9 @@
 #include "parrot/rx.h"
 
 const INTVAL RX_MARK=-1;
+const char * 
+RX_WORDCHARS="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
+const char * RX_NUMCHARS="0123456789";
+const char * RX_SPACECHARS="\r\n\t ";
 
 /*************************************************************
  * Parrot Regular Expression Engine, version 3.0 alpha (Rx3) *
@@ -39,21 +42,19 @@
        return rx;
 }
 
-BOOLVAL rx_is_word_character(char ch) {
-       if(
-               (ch >= 'a' && ch <= 'z') ||
-               (ch >= 'A' && ch <= 'Z') ||
-               (ch >= '0' && ch <= '9') ||
-               ch == '_'
-       ) {
-               return 1;
-       }
-       else {
-               return 0;
+BOOLVAL rx_is_word_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+       static Bitmap bmp=NULL;
+       
+       if(!bmp) {
+               bmp=make_bitmap(cstr2pstr(RX_WORDCHARS));
        }
+       
+       return check_bitmap(ch, bmp);
 }
 
-BOOLVAL rx_is_number_character(char ch) {
+BOOLVAL rx_is_number_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+       /* faster to do less-than/greater-than */
+       
        if(ch >= '0' && ch <= '9') {
                return 1;
        }
@@ -62,13 +63,14 @@
        }
 }
 
-BOOLVAL rx_is_whitespace_character(char ch) {
-       if(ch == '\n' || ch == '\r' || ch == '\t' || ch == ' ') {
-               return 1;
-       }
-       else {
-               return 0;
+BOOLVAL rx_is_whitespace_character(struct Parrot_Interp *interpreter, INTVAL ch) {
+       static Bitmap bmp=NULL;
+       
+       if(!bmp) {
+               bmp=make_bitmap(cstr2pstr(RX_SPACECHARS));
        }
+
+       return check_bitmap(ch, bmp);
 }
 
 STRING *rxP_get_substr(struct Parrot_Interp *interpreter, STRING * source, INTVAL 
startindex, INTVAL length) {
@@ -82,3 +84,33 @@
 
        return ret;
 }
+
+Bitmap make_bitmap(STRING* str) {
+       INTVAL i, ch;
+       Bitmap bmp=mem_sys_allocate(32);
+       
+       /* XXX Bitmaps currently do not support chars > 255 -- 
+          the memory space needed grows _very_ fast */
+       
+       for(i=0; i < 32; i++) {
+               bmp[i]=0;
+       }
+       
+       for(i=0; i < string_length(str); i++) {
+               ch=string_ord(str, i);
+
+               if(ch > 255) return NULL; /* Can't do it--caller must figure out 
+another way */
+
+               bmp[ch>>3] |= (1<<(ch & 7));
+       }
+       
+       return bmp;
+}
+
+BOOLVAL check_bitmap(INTVAL ch, Bitmap bmp) {
+       if(ch > 255) {
+               return 0;
+       }
+       
+       return bmp[ch>>3] & (1<<(ch & 7));
+}
\ No newline at end of file
diff -ur ..\..\parrot-cvs\parrot/rx.ops ./rx.ops
--- ..\..\parrot-cvs\parrot/rx.ops      Mon Jan 14 01:19:28 2002
+++ ./rx.ops    Mon Jan 14 01:31:10 2002
@@ -207,6 +207,23 @@
        goto NEXT();
 }
 
+op rx_clearinfo(inout pmc, in str) {
+       RX_dUNPACK($1);
+       rx->index=rx->startindex=0;
+       rx->flags=enum_rxflags_none;
+       rx->success=0;
+       rx->minlength=0;
+       rx->whichway=enum_rxdirection_forwards;
+       
+       while(stack_depth(interpreter, rx->stack)) {
+               stack_pop(interpreter, rx->stack, NULL, STACK_ENTRY_INT);
+       }
+       
+       rx->string=$2;
+       
+       goto NEXT();
+}
+
 ########################################
 
 =item C<rx_freeinfo>(inout pmc)
@@ -230,13 +247,26 @@
 structure in another register, the stack, or a symbol table entry before calling this
 opcode.
 
-B<XXX> Currently this op has not been implemented.
+This opcode actually creates a new stack for the new regular expression structure, but
+all other fields (including the group-related ones) stay the same.  It's primarily 
+used
+for things like lookaheads and lookbehinds, where the regex's state should be 
+completely
+restored to the original version if the match succeeds.  (Well, almost 
+completely--groups
+matched with a cloned structure live on in the original.)
 
 =cut
 
 op rx_cloneinfo(inout pmc) {
+       rxinfo *rx2;
        RX_dUNPACK($1);
+       
+       rx2=rx_allocate_info(interpreter, rx->string);
+       *rx2=*rx;
+       
+       rx2->stack=new_stack(interpreter);
 
+       $1=pmc_new(interpreter, enum_class_ParrotPointer);
+       $1->data=rx2;
+       
        goto NEXT();
 }
 
@@ -376,7 +406,6 @@
 op rx_pushmark(in pmc) {
        RX_dUNPACK($1);
 
-       /* Don't worry about the const * to void * cast on the next line */
        stack_push(interpreter, rx->stack, (void *)&RX_MARK, STACK_ENTRY_INT, NULL);
        
        goto NEXT();
@@ -651,8 +680,8 @@
        RX_dUNPACK($1);
 
        RxAssertMore(rx, $2);
-
-       if(rx_is_word_character(RxCurChar(rx))) {
+       
+       if(rx_is_word_character(interpreter, RxCurChar(rx))) {
                RxAdvance(rx);
                goto NEXT();
        }
@@ -661,6 +690,7 @@
        }
 }
 
+
 ########################################
 
 =item C<rx_is_n>(in pmc, in int)
@@ -674,7 +704,7 @@
 
        RxAssertMore(rx, $2);
 
-       if(rx_is_number_character(RxCurChar(rx))) {
+       if(rx_is_number_character(interpreter, RxCurChar(rx))) {
                RxAdvance(rx);
                goto NEXT();
        }
@@ -696,7 +726,7 @@
 
        RxAssertMore(rx, $2);
 
-       if(rx_is_whitespace_character(RxCurChar(rx))) {
+       if(rx_is_whitespace_character(interpreter, RxCurChar(rx))) {
                RxAdvance(rx);
                goto NEXT();
        }
@@ -722,18 +752,19 @@
 
 op rx_oneof(in pmc, in str, in int) {
        RX_dUNPACK($1);
-       STRING *ch1;
-       STRING *ch2;
-       INTVAL i;
-       
-       /* XXX In the future, this ought to use bitmaps. */
-       
+       Bitmap bitmap;
+
        RxAssertMore(rx, $3);
        
-       ch1=RxCurCharS(rx);
+       bitmap=make_bitmap($2); 
        
-       if(string_length($2) < 8) { /* XXX run benchmarks to find a good value */
-               /* modified linear search--slow, but zero overhead */
+       if(!bitmap) {
+               INTVAL i;
+               STRING *ch1;
+               STRING *ch2;
+
+               ch1=RxCurCharS(rx);
+               
                for(i=0; i < string_length($2); i++) {
                        ch2=rxP_get_substr(interpreter, $2, i, 1);
                
@@ -745,41 +776,63 @@
                                goto OFFSET($3);
                        }
                }
+
+               goto OFFSET($3);
        }
        else {
-               /* binary search--fast but complicated */
-               INTVAL upper, lower=0, index=0, lastindex=-1, cmp;
-
-               upper=string_length($2);
-               
-               while(upper > lower) {
-                       index=(upper+lower)/2;
-                       
-                       if(index==lastindex) {
-                               goto OFFSET($3);        
-                       }
-                       else if(index==string_length($2)) {
-                               goto OFFSET($3);
-                       }
-               
-                       cmp=string_compare(interpreter, RxCurCharS(rx), 
rxP_get_substr(interpreter, $2, index, 1));
-                       
-                       if(0==cmp) {
-                               RxAdvance(rx);
-                               goto NEXT();
-                       }
-                       else if(0 > cmp) {
-                               upper=index;
-                       }
-                       else {
-                               lower=index;
-                       }
-                       
-                       lastindex=index;
+               if(check_bitmap(RxCurChar(rx), bitmap)) {
+                       mem_sys_free(bitmap);
+                       RxAdvance(rx);
+                       goto NEXT();
+               }
+               else {
+                       goto OFFSET($3);
                }
        }
+}
+
+
+########################################
+
+=item C<rx_oneof_bmp>(in pmc, in pmc, in int)
+
+This op has the exact same behavior as C<rx_oneof>, except that the second parameter 
+is a ParrotPointer to a bitmap generated by C<rx_makebmp>.
+
+=cut
+
+op rx_oneof_bmp(in pmc, in pmc, in int) {
+       RX_dUNPACK($1);
+       
+       RxAssertMore(rx, $3);
        
-       goto OFFSET($3);
+       if(check_bitmap(RxCurChar(rx), $2->data)) {
+               RxAdvance(rx);
+               goto NEXT();
+       }
+       else {
+               goto OFFSET($3);
+       }
+}
+
+#######################################
+
+=item C<rx_makebmp>(out pmc, in str)
+
+This op pre-generates bitmaps to be used with C<rx_oneof_bmp>, increasing performance.
+The first parameter will be set to a ParrotPointer to the bitmap; the second parameter
+is the string to be bitmapped.
+
+Note that bitmaps are currently NOT compatible with characters above 255 (as defined 
+by
+whatever character set you're using).  This may change in the future.
+
+=cut
+
+op rx_makebmp(out pmc, in str) {
+       $1=pmc_new(interpreter, enum_class_ParrotPointer);
+       $1->data=(void*)make_bitmap($2);
+       
+       goto NEXT();
 }
 
 ########################################
@@ -792,6 +845,7 @@
 =cut
 
 op rx_dot(in pmc, in int) {
+       static Bitmap bmp;
        RX_dUNPACK($1);
 
        RxAssertMore(rx, $2);
@@ -801,10 +855,11 @@
                goto NEXT();
        }
        else {
-               STRING *ch=RxCurCharS(rx);
-               STRING *nl=string_make(interpreter, "\n", 1, 0, 0, 0);
+               if(!bmp) {
+                       bmp=make_bitmap(cstr2pstr("\r\n"));
+               }
 
-               if(string_compare(interpreter, ch, nl)!=0) {
+               if(!check_bitmap(RxCurChar(rx), bmp)) {
                        RxAdvance(rx);
                        goto NEXT();
                }
@@ -825,14 +880,14 @@
 
 op rx_zwa_boundary(in pmc, in int) {
        RX_dUNPACK($1);
-       char ch1, ch2;
+       BOOLVAL one, two;
 
-       ch1=RxCurChar(rx);
+       one=rx_is_word_character(interpreter, RxCurChar(rx));
        RxAdvanceX(rx, -1);
-       ch2=RxCurChar(rx);
+       two=rx_is_word_character(interpreter, RxCurChar(rx));
        RxAdvance(rx);
 
-       if(rx_is_word_character(ch1) == rx_is_word_character(ch2)) {
+       if(one && two || !one && !two) {
                goto OFFSET($2);
        }
        
--- ..\..\parrot-cvs\parrot/t/op/rx.t   Thu Jan 10 16:04:56 2002
+++ ./t/op/rx.t Mon Jan 14 01:35:16 2002
@@ -139,8 +139,6 @@
 no match
 OUTPUT
 
-TODO: {
-       local $TODO="pending key fixes" if $^O eq "MSWin32";
 output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'groups');
                rx_startgroup P0, 0
                rx_literal P0, "a", $advance
@@ -156,7 +154,6 @@
 (a)
 <><a><>
 OUTPUT
-}
 
 output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (success)');
                rx_zwa_atbeginning P0, $advance

Reply via email to