Parrot Tokenizer

Peter Sergeant Sun, 18 Aug 2002 04:28:45 -0700

If anyone is interested, I've hacked together a Parrot tokenizer ... the 
main intention for it was to form the basis of a syntax highlighter, but 
I'm sure someone will find more creative applications for it...


I hacked together a very quick and dirty syntax highlighter (about 10 
lines) using the Tokenizer, and the results of running on 
examples/assembly/life.pasm can be found here:

http://grou.ch/parrot.html

+Pete

# Parrot Tokenizer in Perl

# Copyright 2002 Peter Sergeant <[EMAIL PROTECTED]>

package Parrot::Tokenizer;

use strict;

# Token cache shared by tokenize(), arguments() and token(). Each entry is an
# array ref of [ type, data ]; tokenize() empties the list on every call.
my @tokens;

# Keeping the tokens in one file-level variable is a design fault (the
# original author's own note): it was not thought through at the time.

# Split a chunk of Parrot assembly into a flat list of tokens.
#
# Takes the source text as a single string and returns the list of tokens,
# each an array ref of [ type, data ]. The types emitted directly here are
# 'whitespace', 'comment', 'label' and 'operation'; whatever follows an
# operation name is handed to arguments(). The shared @tokens cache is
# emptied on every call. Lines that match none of the patterns below
# produce no tokens.
sub tokenize {

        my ($source) = @_;

        # Start from an empty token cache
        @tokens = ();

        # Break on line boundaries, keeping each newline with its line
        my @chunks = split(/(?<=\n)/, $source);

        for my $text (@chunks) {

                # Peel off leading whitespace when something follows it
                token('whitespace', $1) if $text =~ s/^(\s+)(\S)/$2/;

                # A comment takes up the whole (remaining) line
                if (substr($text, 0, 1) eq "#") {
                        token('comment', $text);
                        next;
                }

                # A line holding nothing but whitespace
                if ($text =~ m/^\s+$/) {
                        token('whitespace', $text);
                        next;
                }

                # Optional label, which may itself be followed by a comment
                if ($text =~ s/^(\$?[a-z0-9_]+\:)//i) {
                        token('label', $1);

                        # Whitespace between the label and what follows it
                        token('whitespace', $1) if $text =~ s/^(\s+)//;

                        if (substr($text, 0, 1) eq "#") {
                                token('comment', $text);
                                next;
                        }
                }

                # Operation name, with or without a label in front of it
                if ($text =~ s/^([a-z0-9_]+)//i) {
                        token('operation', $1);
                        arguments($text);
                }
        }

        return @tokens;

}

# Tokenize the remainder of a line after its operation name.
#
# Takes the rest of the line as a string and records tokens via token():
# 'whitespace', 'comma', 'comment', 'double_quoted_string',
# 'single_quoted_string' and 'argument'. Dies with "*<rest>* -- weird" when
# the remaining text matches none of the known forms. The return value
# carries no information.
sub arguments {

        my $line = shift;

        # Nothing to do when no argument text was passed at all
        return unless defined $line;

        # Test the length, not the truth value: a remainder of exactly "0"
        # (e.g. the last argument of "set I0, 0" on a final line with no
        # trailing newline) is false in Perl and would otherwise be dropped
        # without producing a token.
        while (length $line) {

                # Whitespace checks: leading whitespace, optionally followed
                # by a comma and more whitespace
                if ($line =~ s/^(\s+)(,?)(\s*)//) {
                        token('whitespace', $1);
                        token('comma', ',') if $2;
                        token('whitespace', $3) if $3;
                }

                # The whitespace may have been the last thing on the line
                return unless length $line;

                # Comment? It swallows the rest of the line
                if (substr($line, 0, 1) eq "#") {
                        token('comment', $line);
                        return;

                # Quoted argument? The closing quote must not be preceded
                # by a backslash
                } elsif ($line =~ s/^(".*?(?<!\\)")(,?)//) {
                        token('double_quoted_string', $1);
                        token('comma', ',') if $2;

                } elsif ($line =~ s/^('.*?(?<!\\)')(,?)//) {
                        token('single_quoted_string', $1);
                        token('comma', ',') if $2;

                # Must be a non-quoted argument
                } elsif ($line =~ s/^(\.?\$?[\[\]+\-a-z0-9_]+)(,?)//i) {
                        token('argument', $1);
                        token('comma', ',') if $2;

                # Something has gone wrong
                } else {
                        die ("*$line* -- weird\n");
                }
        }

        return;

}

# Append one token to the shared @tokens cache.
#
# Takes the token type and the matched text, and stores them as a
# two-element array ref [ type, data ].
sub token {

        my ($kind, $text) = @_;

        push @tokens, [ $kind, $text ];

}

1;

Parrot Tokenizer

Reply via email to