Hi all, I have developed some adittions that give Parrot a limited amount of support to regular expressions.
It all started as a little experiment to find out what the "compile down to low-level ops" thing could mean someday. The patch consists of: * 5 new opcodes: - matchexactly - matchanychar - initbrstack - clearbrstack - backtrack - savebr The first two are the ones that actually implement the matches. initbrstack, clearbrstack, backtrack, savebr are for managing the stack of pending possible matches. They use internally the integer and destination stack. * A perl package and script that implement a simple regex compiler (using YAPE::Regex by the way). The compiler currently outputs a parrot program that matches the regexp against a predefined string. It could be easily modified to proceduce something more useful. Currently, the following features are supported. * exact matches * any char (.) * nested groups (do not capture) * alternation * simple quantifires (*, + ?) There is a lot of room for improvment, either by implementing features that do not require changes in Parrot (non-greedy-quantifiers, anchors, capturing and most of regex options can be added right now) or by making the necessary changes in Parrot (support for locales are required for macros, etc..). This is not a serious patch, in the sense that there are many things missing, the ones that are supposed to work are not tested enough and even the ones that work are implemented in a way that is just wrong. I am a rather mediocre programmer, and this are the first lines of code i ever sent to a mailing list, so please be benevolent with me. :) Anyway I thought it would be interesting to share my little experiment. Sincerly, ------------------- Angel Faus [EMAIL PROTECTED]
1814a1815,1882 > ######################################## > > AUTO_OP matchexactly(sc, s, i, ic){ > > STRING* temp; > > > if (string_length($2) <= $3) { > RETREL($4); > } > > temp = string_substr(interpreter, $2, $3 , string_length($1), NULL); > > if (string_compare(interpreter, $1, temp) != 0 ) { > RETREL($4); > } > else { > $3 = $3 + string_length($1); > } > } > > AUTO_OP matchanychar(s, i, ic) { > if (string_length($1) > $2){ > $2++; > } > else { > RETREL($3); > } > } > > MANUAL_OP backtrack(i){ > opcode_t *dest; > > pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT); > pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION); > > RETABS(dest); > } > > > AUTO_OP savebr(i, ic){ > > push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[2], STACK_ENTRY_DESTINATION, NULL); > > push_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT, NULL); > > } > > AUTO_OP initbrstack(ic) { > INTVAL i; > i = -1; > > push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[1], STACK_ENTRY_DESTINATION, NULL); > push_generic_entry(interpreter, &interpreter->user_stack_top, &i, STACK_ENTRY_INT, NULL); > > } > > AUTO_OP clearbrstack(i){ > opcode_t *dest; > > while ($1 && $1 >= 0) { > pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION); > pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT); > } > > } > > 1826a1895 >
package BabyRegex; use YAPE::Regex 'BabyRegex'; use strict; use vars '$VERSION'; $VERSION = '0.01'; my %modes = ( on => '', off => '' ); sub buildtree { my $self = shift; my $cnt = 0; my ($groupscnt, @groups); my @tree; while (my $node = $self->next) { $node->id($cnt++); $tree[-1]->next($node) if @tree; if ($node->type =~ /capture|group/) { push @groups, $node; $node->{ALTS} = []; $node->{COUNT} = $groupscnt++; } if ($node->type eq "alt") { push (@{$groups[-1]->{ALTS}}, $node); my $groupnode = $groups[-1]; $node->{GROUP} = $groupnode; push @{$groupnode->{ALTS}}, $node, } if ($node->type eq "close"){ my $groupnode = pop @groups; $groupnode->{CLOSED} = $node; $node->{GROUP} = $groupnode; for my $alt (@{$groupnode->{ALTS}}) { #Alt nodes get its ID replaced by the Closing node ID, so #that the when its antecessors calls ->next->id it gets the good one. #This is probably on of the worse to do that. $alt->{ID} = $node->{ID}; } } push (@tree, $node); } return @tree; } sub cry { if (@_[1]) { my $label = shift; my $opcode = shift; my $spc = " " x (4 - length($label) ) ; print $label. ":" . $spc . $opcode . "\n"; } else { my $opcode = shift; print " $opcode\n"; } } sub pasm { my ($self, $string) = @_; my @tree = $self->buildtree; cry "INIT", "initbrstack FAIL"; cry "set I1, 0"; cry "set S1, \"$string\""; for my $node (@tree) { $node->pasm($self); #print $node->type; } cry "OK", "print \"match\""; cry "clearbrstack I1"; cry "end"; print "\n"; cry "FAIL", "print \"fail\""; cry "clearbrstack I1"; cry "end"; print "\n"; cry "BT", "backtrack I1"; print "\n"; } ## ## shared methods ## sub BabyRegex::Element::id { my $self = shift; my $id = shift; if ($self->{ID}) { return $self->{ID} } else { $self->{ID} = "L" . $id; } } sub BabyRegex::Element::next { my $self = shift; my $next = shift; if ($next) { $self->{NEXT} = $next; return $next; } else { return $self->{NEXT} } } sub BabyRegex::Element::cry_atomic { my $self = shift; my $opcode = shift; my $id = $self->id; if ($self->quant eq "*") { my $nextid = $self->next()->id(); cry $id, "savebr I1, $nextid"; cry $opcode; cry "branc $id"; } elsif ($self->quant eq "+" ) { my $nextid = $self->next()->id(); cry $id, $opcode; cry "savebr I1, $nextid"; cry "branch $id"; } elsif ($self->quant eq "?" ) { my $nextid = $self->next()->id(); cry $id, "savebr I1, $nextid"; cry $opcode; } else { cry $id, $opcode; } } ## ## each element pasm ## sub BabyRegex::anchor::pasm { my $self = shift; my $type = $self->{TEXT}; print $type; } sub BabyRegex::macro::pasm { die "unimplemented\n"; } sub BabyRegex::oct::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::hex::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::utf8hex::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::ctrl::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::named::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::Cchar::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::any::pasm { my $self = shift; my $l; my $id = $self->id; if ($modes{on} =~ /s/) { $self->cry_atomic ("matchanychar S1, I1, BT"); } else { #$self->cry_atomic ("matchanycharbutnl S1, I1, BT"); #we don't have the opcode anyway $self->cry_atomic ("matchanychar S1, I1, BT"); } } sub BabyRegex::text::pasm { my $self = shift; my $text = $self->text; $text =~ s/\n/\\n/g; $text =~ s/\r/\\r/g; $text =~ s/\t/\\t/g; $text =~ s/\f/\\f/g; $text =~ s/'/\\'/g; my $id = $self->id(); $self->cry_atomic ("matchexactly \"$text\", S1, I1, BT"); } sub BabyRegex::alt::pasm { my $self = shift; my $id = $self->id(); my $endofgroup_id = $self->{GROUP}->{CLOSED}->id; cry("branch $endofgroup_id"); } sub BabyRegex::slash::pasm { die "unimplemented\n"; } sub BabyRegex::class::pasm { die "unimplemented\n"; } sub BabyRegex::group::pasm{ my $self = shift; my $id = $self->id; my $cnt = $self->{COUNT}; my $fs = substr($self->fullstring,1,30); print "\n"; cry $id, "#start of n.c. group $cnt ($fs...)"; if ($self->quant eq "*" or $self->quant eq "?") { cry "savebr I1, ". $self->{CLOSED}->next->id(); } foreach my $alt (@{$self->{ALTS}}) { cry "savebr I1, " . $alt->next->id(); } } sub BabyRegex::capture::pasm { # We are not capturing anything yet! my $self = shift; my $id = $self->id; my $cnt = $self->{COUNT}; my $fs = substr($self->fullstring,1,30); print "\n"; if ($self->quant eq "*" or $self->quant eq "?") { cry "savebr I1, ". $self->{CLOSED}->next->id(); } cry $id, "#start of group $cnt ($fs...)"; foreach my $alt (@{$self->{ALTS}}) { cry "savebr I1, ". $alt->next->id(); } } sub BabyRegex::close::pasm { my $self = shift; my $id = $self->id; my $cnt = $self->{GROUP}->{COUNT}; cry $id, "#end of group $cnt"; if ($self->{GROUP}->quant eq "*" or $self->{GROUP}->quant eq "+") { cry "savebr I1, " . $self->next->id(); cry "branch " . $self->{GROUP}->id; } print "\n"; } sub BabyRegex::comment::pasm { } sub BabyRegex::whitespace::pasm{ } sub BabyRegex::lookahead::explanation { die "unimplemented\n"; } sub BabyRegex::lookbehind::explanation { die "unimplemented\n"; } sub BabyRegex::code::pasm { die "unimplemented\n"; } sub BabyRegex::later::pasm { die "unimplemented\n"; } sub BabyRegex::conditional::pasm { die "unimplemented\n"; } sub BabyRegex::cut::pasm { die "unimplemented\n"; } sub BabyRegex::flags::pasm{ die "unimplemented\n"; } sub BabyRegex::backref::pasm { die "unimplemented \n"; } 1; __END__ =head1 NAME BabyRegex - compiles a regular expression down to Parrot bytecode =head1 SYNOPSIS use BabyRegex; BabyRegex->new($REx)->pasm; =head1 SEE ALSO The C<YAPE::Regex> documentation. =head1 AUTHOR Angel Faus [EMAIL PROTECTED] Based in YAPE::Regex::Explain by Jeff Pinyan ([EMAIL PROTECTED]) =cut
use BabyRegex; unless (@ARGV[0] & @ARGV[1]) { print 'usage: perl babyre.pl "pattern" "string"' . "\n"; print 'ex: perl babyre.pl "reg(exp?|ular +expression)?" "regex" > regex.pasm' . "\n"; exit; } $pattern = @ARGV[0]; $string = @ARGV[1]; $c = BabyRegex->new($pattern); $c->pasm($string);