Hi All, I've picked up the PyParsing module and am trying to figure out how to do a simple parsing of some HTML source code. My specific problem is dealing with an <TD></TD> element that is blank.
from pyparsing import * import sys integer = Word("0123456789") trStart = Literal("<TR>").suppress() trEnd = Literal("</TR>").suppress() tdStart = Literal("<TD>").suppress() tdEnd = Literal("</TD>").suppress() #dataItem = Word(alphas) BlankItem = Word('') dataItem = Word(alphanums + " " + "," + ":") # works with spaces in data MultiItem = Optional(OneOrMore(dataItem)) TestLine = ['<TR><TD>Group</TD><TD>Year</TD><TD>City</TD></TR>', '<TR><TD>AAA</TD><TD>1992</TD><TD>Los Angeles</TD></TR>', '<TR><TD>BBB</TD><TD>2007</TD><TD>Santa Cruz</TD></TR>', '<TR><TD></TD><TD>2001</TD><TD>Santa Cruz</TD></TR>'] htmlLine = trStart + tdStart + MultiItem.setResultsName('status') + tdEnd + tdStart + MultiItem.setResultsName('year') + tdEnd + tdStart + MultiItem.setResultsName('title') + tdEnd + trEnd for CurrentLine in TestLine: print 'Line = ', CurrentLine for srvrtokens,startloc,endloc in htmlLine.scanString( CurrentLine ): print 'tokens = %s %d %d \n' % (srvrtokens, startloc,endloc) Output : Line = <TR><TD>Group</TD><TD>Year</TD><TD>City</TD></TR> tokens = ['Group', 'Year', 'City'] 0 49 Line = <TR><TD>AAA</TD><TD>1992</TD><TD>Los Angeles</TD></TR> tokens = ['AAA', '1992', 'Los Angeles'] 0 54 Line = <TR><TD>BBB</TD><TD>2007</TD><TD>Santa Cruz</TD></TR> tokens = ['BBB', '2007', 'Santa Cruz'] 0 53 *** Blank 1st element - only shows 2 elements - need 3 elements to be consistent *** Line = <TR><TD></TD><TD>2001</TD><TD>Santa Cruz</TD></TR> tokens = ['2001', 'Santa Cruz'] 0 50 Any assistance would be greatly appreciated! Steve -- http://mail.python.org/mailman/listinfo/python-list