I'm using pyparsing to parse nested tables, and I want to keep each table's structure and properties. But the parse gets cut off at the </td> tag of the inner table.
# Have any ideas? Here's the program:
from pyparsing import *

# Sample input: an outer table whose second row holds a nested table.
mytable = """
<table id="leftpage_table" width="156" border="0" cellspacing="0" cellpadding="0">
  <tr id="trtd" height="24">
    <td width="153" background="images/bt_kind.gif" align="center" class="left_menu">system</td>
  </tr>
  <tr id="trtd_down" height="20">
    <td id="trtd_down"><table id="inner_lefgpage_table" width="100%" height="100%" border="0" cellspacing="0" cellpadding="0">
      <tr id="inner_trtd" height="20">
        <td background="images/bt_class.gif" align="center">art</td>
      </tr>
      <tr>
        <td background="images/bt_class.gif" align="center">art</td>
      </tr>
    </table></td>
  </tr>
</table>
"""

startTag = Literal("<")
endTag = Literal(">")

# id="value": the attribute name and "=" are suppressed, so only the
# (unquoted) value is kept in the results.
idPattern = (
    CaselessLiteral("id").suppress()
    + Literal("=").suppress()
    + (
        quotedString.copy().setParseAction(removeQuotes)
        | Word(srange("[a-zA-Z0-9_~]"))
    )
)

# Any other attribute, kept as a single 'name=value' token (quotes included).
# Raw string: the same characters as before, without an invalid "\." escape.
attrPattern = Combine(
    Word(alphanums + "_")
    + Literal("=")
    + (quotedString | Word(srange(r"[a-zA-Z0-9_~:&@#;?/\.]")))
)

# Placeholder for the recursive reference: a <td> may contain a whole table.
# Its expression is filled in with "<<" once the grammar is complete.
tablePattern = Forward()


def getItemCloseTag(x):
    """Return a suppressed expression matching the closing tag ``</x>``."""
    return Combine(
        startTag + Literal("/") + CaselessLiteral(x) + endTag
    ).suppress()


def getItemStartTag(x):
    """Return an expression matching the opening tag ``<x ...>``.

    The tag name and angle brackets are suppressed. The results are two
    groups: the id values, then the remaining ``name=value`` attributes.
    An id is only recognised when it precedes the other attributes.
    """
    return (
        startTag.suppress()
        + Keyword(x, caseless=True).suppress()
        + Group(ZeroOrMore(idPattern))
        + Group(ZeroOrMore(attrPattern))
        + endTag.suppress()
    )


def getItemPattern(x):
    """Return an expression for a leaf element such as ``<td>...</td>``.

    After the opening tag come a group of zero or more nested tables and a
    group holding the raw text up to the closing tag.
    """
    tCloseTag = getItemCloseTag(x)
    return (
        getItemStartTag(x)
        + Group(ZeroOrMore(tablePattern))
        + Group(SkipTo(tCloseTag))
        + tCloseTag
    )


def getMultiLevelPattern(x, y):
    """Return an expression for element ``x`` wrapping one or more ``y``."""
    tCloseTag = getItemCloseTag(x)
    return getItemStartTag(x) + Group(OneOrMore(y)) + tCloseTag


tdPattern = getItemPattern(x='td')
trPattern = getMultiLevelPattern('tr', tdPattern)
# Fix: assign INTO the existing Forward with "<<". The original rebound the
# name with "=", so the Forward already captured inside tdPattern never got
# an expression. ZeroOrMore(tablePattern) then matched nothing inside a
# <td>, and SkipTo("</td>") swallowed the inner table as raw text up to the
# inner table's first </td>.
tablePattern << getMultiLevelPattern('table', trPattern)

t = tablePattern
for toks, strt, end in t.scanString(mytable):
    print(toks.asList())

# Output reported in the original post. It was produced before the "<<" fix
# above, which is why the inner table appears as raw text cut at the first
# inner </td>:
# [['leftpage_table'], ['width="156"', 'border="0"', 'cellspacing="0"',
# 'cellpadding="0"'], [['trtd'], ['height="24"'],
# [[], ['width="153"', 'background="images/bt_kind.gif"', 'align="center"',
# 'class="left_menu"'], [], ['system']], ['trtd_down'], ['height="20"'],
# [['trtd_down'], [], [], ['<table id="inner_lefgpage_table" width="100%"
# height="100%" border="0" cellspacing="0" cellpadding="0">\n <tr
# id="inner_trtd" height="20">\n <td background="images/bt_class.gif"
# align="center">art']], [], [], [[], ['background="images/bt_class.gif"',
# 'align="center"'], [], ['art']]]]
# -- http://mail.python.org/mailman/listinfo/python-list