My first Python program -- a lexer

Thomas Mlynarczyk Sat, 08 Nov 2008 13:00:44 -0800

Hello,

I started to write a lexer in Python -- my first attempt to do something useful with Python (rather than trying out snippets from tutorials). It is not complete yet, but I would like some feedback -- I'm a Python newbie and it seems that, with Python, there is always a simpler and better way to do it than you think.


### Begin ###

import re

class Lexer(object):
    """Regex-driven tokenizer that scans its whole input on construction.

    Parameters:
        source: the text to tokenize.  "\\r\\n" and bare "\\r" line endings
            are normalised to "\\n" before scanning.
        tokens: either a mapping ``{name: regex_string}`` or an ordered
            sequence of ``(name, regex_string)`` pairs.  Rules are tried in
            iteration order and the first one that matches wins, so pass a
            sequence when the priority between overlapping rules matters.

    Attributes:
        source: the normalised input text.
        tokens: dict mapping each token name to its compiled pattern
            (a new dict; the object passed by the caller is not modified).
        offset: current scan position in ``source``.
        line:   1-based line number of the current scan position.
        result: list of ``(name, match, line)`` tuples in input order, where
            ``match`` is the regex match object and ``line`` is the line on
            which the token starts.

    Raises:
        ValueError: if no rule matches at some offset of ``source``.
    """

    def __init__(self, source, tokens):
        # "\r\n?" covers both Windows ("\r\n") and bare "\r" line endings.
        self.source = re.sub(r"\r\n?", "\n", source)
        self.tokens = tokens
        self.offset = 0
        self.result = []
        self.line = 1
        self._compile()
        self._tokenize()

    def _compile(self):
        """Compile every token regex (with re.M) into an ordered rule list."""
        spec = self.tokens
        pairs = spec.items() if hasattr(spec, "items") else spec
        # Build fresh containers instead of writing compiled patterns back
        # into the caller's object, so the same spec can be reused for
        # another Lexer.
        self._rules = [(name, re.compile(regex, re.M))
                       for name, regex in pairs]
        self.tokens = dict(self._rules)

    def _tokenize(self):
        """Scan ``self.source`` from ``self.offset`` and fill ``self.result``."""
        text = self.source
        length = len(text)
        while self.offset < length:
            for name, regex in self._rules:
                match = regex.match(text, self.offset)
                # An empty match consumes nothing; accepting it would leave
                # the offset unchanged and loop forever, so treat it as
                # "no match" and try the next rule.
                if not match or match.end() == self.offset:
                    continue
                self.result.append((name, match, self.line))
                self.line += match.group(0).count("\n")
                self.offset = match.end()
                break
            else:
                # No rule consumed any input at this position.
                raise ValueError(
                    'Syntax error in source at offset %s' % self.offset)

    def __str__(self):
        """Return one tab-separated row per token: line, offset, name, text."""
        return "\n".join(
            "[L:%s]\t[O:%s]\t[%s]\t'%s'"
            % (line, match.start(), name, match.group(0))
            for name, match, line in self.result)

# Test Example: tokenize a small two-line snippet and print the token table.

source = r"""
    Name: "Thomas", # just a comment
    Age: 37
"""

# Token name -> regular expression source.  Lexer compiles each pattern with
# re.M, so the "$" in T_COMMENT matches at the end of every line rather than
# only at the end of the whole input.
tokens = {
    'T_IDENTIFIER' : r'[A-Za-z_][A-Za-z0-9_]*',
    'T_NUMBER'     : r'[+-]?\d+',
    'T_STRING'     : r'"(?:\\.|[^\\"])*"',
    'T_OPERATOR'   : r'[=:,;]',
    'T_NEWLINE'    : r'\n',
    'T_LWSP'       : r'[ \t]+',
    'T_COMMENT'    : r'(?:\#|//).*$' }

# With a single argument, print(...) produces the same output on Python 2
# (where it is a parenthesised expression) and Python 3 (a function call);
# the bare print statement is a syntax error on Python 3.
print( Lexer( source, tokens ) )

### End ###


Greetings,
Thomas

--
Ce n'est pas parce qu'ils sont nombreux à avoir tort qu'ils ont raison!
(Coluche)
--
http://mail.python.org/mailman/listinfo/python-list

My first Python program -- a lexer

Reply via email to