I started to write a lexer in Python -- my first attempt to do something
useful with Python (rather than trying out snippets from tutorials). It
is not complete yet, but I would like some feedback -- I'm a Python
newbie and it seems that, with Python, there is always a simpler and
better way to do it than you think.
### Begin ###
import re
class Lexer(object):
    """Split a source string into tokens using named regular expressions.

    The whole source is tokenized when the lexer is constructed. Every entry
    of ``result`` is a ``(name, match, line)`` tuple: the token name, the
    regex match object, and the 1-based line on which the token starts.

    Token patterns are tried in the iteration order of ``tokens`` and the
    first one that matches at the current offset wins, so pass an ordered
    mapping when two patterns can match at the same position.
    """

    def __init__(self, source, tokens):
        """Normalize line endings in *source* and tokenize it.

        source -- the text to tokenize.
        tokens -- mapping of token name to a regex string (an already
                  compiled pattern is accepted too). It is not modified.

        Raises ValueError if the text at some offset matches no token.
        """
        # Fold "\r\n" first, then any remaining lone "\r", into "\n" so that
        # line counting only ever has to look for one kind of line break.
        self.source = source.replace("\r\n", "\n").replace("\r", "\n")
        self.tokens = tokens
        self.offset = 0
        self.result = []
        self.line = 1
        self._compile()
        self._tokenize()

    def _compile(self):
        """Build ``self._patterns``, a list of ``(name, compiled_regex)``.

        Strings are compiled with ``re.M`` so that ``^`` and ``$`` work per
        line. The caller's mapping is left untouched, which keeps it
        reusable for further lexers.
        """
        patterns = []
        for name, regex in self.tokens.items():
            # Compiled patterns expose .match(); plain strings do not.
            if not hasattr(regex, "match"):
                regex = re.compile(regex, re.M)
            patterns.append((name, regex))
        self._patterns = patterns

    def _tokenize(self):
        """Consume ``self.source`` from ``self.offset`` to its end.

        Appends one ``(name, match, line)`` tuple per token to
        ``self.result`` and advances ``self.offset`` and ``self.line``.

        Raises ValueError when no pattern matches at the current offset.
        """
        source = self.source
        end = len(source)
        while self.offset < end:
            for name, pattern in self._patterns:
                match = pattern.match(source, self.offset)
                # An empty match would not advance the offset and the loop
                # would never terminate, so it counts as "no match".
                if not match or match.end() == self.offset:
                    continue
                # Record the line *before* counting the newlines inside the
                # token, so the token is attributed to the line it starts on.
                self.result.append((name, match, self.line))
                self.line += match.group(0).count("\n")
                self.offset = match.end()
                break
            else:
                # The for loop finished without a break: nothing matched.
                raise ValueError(
                    'Syntax error in source at offset %s' % str(self.offset))

    def __str__(self):
        """Return one line per token: line number, offset, name and text."""
        return "\n".join(
            "[L:%s]\t[O:%s]\t[%s]\t'%s'" %
            (line, match.start(), name, match.group(0))
            for name, match, line in self.result)
# Test Example: a small key/value snippet containing an identifier, a quoted
# string, an operator, a trailing comment and a number.
source = r"""
Name: "Thomas", # just a comment
Age: 37
"""

# Token name -> regular expression tried at the current offset.
tokens = {
    'T_IDENTIFIER' : r'[A-Za-z_][A-Za-z0-9_]*',
    'T_NUMBER'     : r'[+-]?\d+',
    # Double-quoted string; a backslash escapes the following character.
    'T_STRING'     : r'"(?:\\.|[^\\"])*"',
    'T_OPERATOR'   : r'[=:,;]',
    'T_NEWLINE'    : r'\n',
    'T_LWSP'       : r'[ \t]+',
    # "#" or "//" up to the end of the line.
    'T_COMMENT'    : r'(?:\#|//).*$' }

# Parenthesized so the statement is valid under both Python 2 and Python 3.
print( Lexer( source, tokens ) )
### End ###
Ce n'est pas parce qu'ils sont nombreux à avoir tort qu'ils ont raison!