If anyone is interested, I've hacked together a Parrot tokenizer ... the main intention for it was to form the basis of a syntax highlighter, but I'm sure someone will find more creative applications for it...
I hacked together a very quick and dirty syntax highlighter (about 10 lines) using the Tokenizer, and the results of running on examples/assembly/life.pasm can be found here: http://grou.ch/parrot.html +Pete
# Parrot Tokenizer in Perl # Copyright 2002 Peter Sergeant <[EMAIL PROTECTED]> package Parrot::Tokenizer; use strict; my @tokens; # This whole storing a global @tokens thing is a design fault ... I didn't # think about sub tokenize { # Grab incoming data my $buffer = shift; # Clear the token cache @tokens = (); my @lines = split(/(?<=\n)/, $buffer); foreach my $line (@lines) { # First deal with preceeding white-space if ($line =~ s/^(\s+)(\S)/$2/) { token('whitespace', $1); } # Comments if (substr($line, 0, 1) eq "#") { token('comment', $line) # Blank lines } elsif ($line =~ m/^\s+$/) { token('whitespace', $line) # Labels } elsif ($line =~ s/^(\$?[a-z0-9_]+\:)//i) { token('label', $1); # Trim any preceeding whitespace if ($line =~ s/^(\s+)//) { token('whitespace', $1); } # Check to see if we're a comment line if (substr($line, 0, 1) eq "#") { token('comment', $line); # So we have a function... } elsif ($line =~ s/^([a-z0-9_]+)//i) { token('operation', $1); arguments( $line ) } # operation } elsif ($line =~ s/^([a-z0-9_]+)//i) { token('operation', $1); arguments($line); } } return @tokens; } # Process operation arguments sub arguments { my $line = shift; while ($line) { # Whitespace checks if ($line =~ s/^(\s+)(,?)(\s*)//) { token('whitespace', $1); token('comma', ',') if $2; token('whitespace', $3) if $3; } return unless $line; # Comment? if (substr($line, 0, 1) eq "#") { token('comment', $line); return; # Quoted argument? } elsif ($line =~ s/^(".*?(?<!\\)")(,?)//) { token('double_quoted_string', $1); token('comma', ',') if $2; } elsif ($line =~ s/^('.*?(?<!\\)')(,?)//) { token('single_quoted_string', $1); token('comma', ',') if $2; # Must be a none-quoted argument } elsif ($line =~ s/^(\.?\$?[\[\]+\-a-z0-9_]+)(,?)//i) { token('argument', $1); token('comma', ',') if $2; # Something has gone wrong } else { die ("*$line* -- weird\n"); } } } sub token { my $type = shift; my $data = shift; push(@tokens, [ $type, $data ]); } 1;