My goal is to write a parser for these imaginary strings from the SMTP protocol, following RFC 821 and RFC 1869. I'm being a little flexible with the BNF from these RFCs :-) Any comments?
import re

# Sample command lines fed to the parser at the bottom of the script.
# NOTE(review): the "[EMAIL PROTECTED]" fragments look like what the list
# archive left in place of the original addresses/parameters. The strings are
# kept verbatim, so wherever the fragment stands in for a bare address or an
# ESMTP parameter that part cannot match -- confirm against the original post.
tests = [
    'MAIL FROM:<[EMAIL PROTECTED]>',
    'MAIL FROM:[EMAIL PROTECTED]',
    'MAIL FROM:<[EMAIL PROTECTED]> SIZE=1234 [EMAIL PROTECTED]',
    'MAIL FROM:[EMAIL PROTECTED] SIZE=1234 [EMAIL PROTECTED]',
    'MAIL FROM:<"[EMAIL PROTECTED]> legal=email"@address.com>',
    'MAIL FROM:"[EMAIL PROTECTED]> legal=email"@address.com',
    'MAIL FROM:<"[EMAIL PROTECTED]> legal=email"@address.com> SIZE=1234 [EMAIL PROTECTED]',
    'MAIL FROM:"[EMAIL PROTECTED]> legal=email"@address.com SIZE=1234 [EMAIL PROTECTED]',
]


def RN(name, regex):
    """Wrap *regex* in a group so it can be embedded in a larger pattern.

    Returns a named group ``(?P<name>...)`` when *name* is truthy, otherwise
    a non-capturing group ``(?:...)``.
    """
    if name:
        return r'(?P<%s>%s)' % (name, regex)
    return r'(?:%s)' % regex


# Table of pattern fragments, keyed by grammar rule. Later entries are built
# by %-interpolating earlier ones, so definition order matters.
regex = {}

# <dotnum> ::= <snum> "." <snum> "." <snum> "." <snum>
regex['dotnum'] = RN(None, r'\.'.join([r'[012]?\d?\d'] * 4))

# <dot-string> ::= <string> | <string> "." <dot-string>
regex['dot_string'] = RN(None, r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*')

# <domain> ::= <element> | <element> "." <domain>
regex['domain'] = RN('domain', r'%(dotnum)s|%(dot_string)s' % regex)

# <q> ::= any one of the 128 ASCII characters except <CR>, <LF>,
#         quote ("), or backslash (\)
regex['q'] = RN(None, r'[^\n\r"\\]')

# <x> ::= any one of the 128 ASCII characters (no exceptions)
# ('.' is used here, so without re.DOTALL a newline is not accepted.)
regex['x'] = RN(None, r'.')

# <qtext> ::= "\" <x> | "\" <x> <qtext> | <q> | <q> <qtext>
regex['qtext'] = RN(None, r'(?:\\%(x)s|%(q)s)+' % regex)

# <quoted-string> ::= """ <qtext> """
regex['quoted_string'] = RN('quoted_string', r'"%(qtext)s"' % regex)

# <local-part> ::= <dot-string> | <quoted-string>
regex['local_part'] = RN(
    'local_part', r'%(quoted_string)s|%(dot_string)s' % regex)

# <mailbox> ::= <local-part> "@" <domain>
regex['mailbox'] = RN('mailbox', r'%(local_part)s@%(domain)s' % regex)

# <path> ::= "<" [ <a-d-l> ":" ] <mailbox> ">"
# Also accept an address without <>: the closing ">" is required only when
# the optional opening "<" (group path_lt) actually matched.
regex['path'] = RN(
    'path', r'(?P<path_lt><)?%(mailbox)s(?(path_lt)>)' % regex)

# esmtp-keyword ::= (ALPHA / DIGIT) *(ALPHA / DIGIT / "-")
regex['esmtp_keyword'] = RN(None, r'[a-zA-Z0-9][-a-zA-Z0-9]*')

# esmtp-value ::= 1*<any CHAR excluding "=", SP, and all
#                    control characters (US ASCII 0-31 inclusive)>
#                 ; syntax and values depend on esmtp-keyword
# More lenient than the grammar: an empty value is accepted, and only "=",
# space and the listed whitespace controls are excluded.
regex['esmtp_value'] = RN(None, r'[^= \t\r\n\f\v]*')

# esmtp-parameter ::= esmtp-keyword ["=" esmtp-value]
regex['esmtp_parameter'] = RN(
    None, r'%(esmtp_keyword)s(?:=%(esmtp_value)s)?' % regex)

# esmtp-parameters ::= esmtp-parameter *(SP esmtp-parameter)
# The repetition is "*" so that a single parameter is captured as well.
regex['esmtp_parameters'] = RN(
    'esmtp_parameters',
    r'%(esmtp_parameter)s(?:\s+%(esmtp_parameter)s)*' % regex)

# esmtp-cmd ::= inner-esmtp-cmd [SP esmtp-parameters] CR LF
regex['esmtp_addr'] = RN(
    'esmtp_addr', r'%(path)s(?:\s+%(esmtp_parameters)s)?' % regex)

# Compile once; the same pattern is applied to every sample line.
esmtp_addr_re = re.compile(regex['esmtp_addr'])

for t in tests:
    # Strip the leading command keyword (compared case-insensitively); a line
    # with no known keyword is matched as-is.
    for keyword in ['MAIL FROM:', 'RCPT TO:']:
        keylen = len(keyword)
        if t[:keylen].upper() == keyword:
            t = t[keylen:]
            break
    # match() anchors at the start only, so trailing text after the address
    # and parameters does not prevent a match.
    match = esmtp_addr_re.match(t)
    if match:
        print('MATCH local_part=%(local_part)s domain=%(domain)s '
              'esmtp_parameters=%(esmtp_parameters)s' % match.groupdict())
    else:
        print('DONT match %s' % t)

# -- http://mail.python.org/mailman/listinfo/python-list