Berend van Berkum wrote:
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Hi everyone,
I read the source and ran numerous tests, but SGMLParser keeps returning *tag* data
from previous parser instances. I'm totally confused as to why. The content data it
returns is OK.
E.g.::
# First parser: feed a small document, then show the collected text and tag spans.
sp = MyParser()
sp.feed('<test><t />Test</test>')
print sp.content, sp.markup
sp.close()
# Second, newly created parser: only the <xml> tag is expected in its markup.
sp = MyParser()
sp.feed('<xml>\n</xml>\r\n')
print sp.content, sp.markup
sp.close()
gives::
('Test', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}])
('\n\r\n', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}, {'xml': ({}, (0, 1))}])
It keeps the tags from the previous session, while I'm sure the stack etc.
should be clean.
Any ideas?
regards, Berend
- ----
import sgmllib
class MyParser(sgmllib.SGMLParser):
# NOTE: these three names are bound on the class, not on each instance.  The
# two lists are therefore a single shared object for every MyParser that only
# mutates them (e.g. with append) instead of rebinding them on the instance.
content = ''
markup = []
span_stack = []
These are in the _class_ itself, so they will be shared by all its
instances. You should do something like this instead:
def __init__(self):
    """Initialise the base parser and give this instance its own buffers."""
    # The base initializer has to run: it resets the parser state that
    # feed() and close() work on.  Skipping it leaves that state undefined.
    sgmllib.SGMLParser.__init__(self)
    # Bind fresh objects on the instance so nothing is shared via the class.
    self.content = ''      # text collected by handle_data
    self.markup = []       # finished spans: {tag: (attrs, (offset, length))}
    self.span_stack = []   # open spans:     {tag: (attrs, (offset,))}
def handle_data(self, data):
    """Append a chunk of character data to the text collected so far."""
    collected = self.content
    self.content = collected + data
def unknown_starttag(self, tag, attr):
    """Push an open span for ``tag`` onto ``self.span_stack``.

    The entry is a one-item dict ``{tag: (attrs, (offset,))}``, where
    ``attrs`` is ``dict(attr)`` and ``offset`` is the length of the text
    collected in ``self.content`` at the moment the tag is opened.
    """
    attributes = dict(attr)
    start = len(self.content)
    entry = {tag: (attributes, (start,))}
    self.span_stack.append(entry)
def unknown_endtag(self, tag):
    """Close the open span for ``tag`` and record it in ``self.markup``.

    Each entry on ``self.span_stack`` is a one-item dict
    ``{name: (attrs, (offset,))}``.  Entries above the matching one are
    treated as tags that were never closed and are recorded with length 0;
    the matching entry is recorded with the length of the text collected
    in ``self.content`` since it was opened.

    An end tag with nothing to close (empty stack, or no open entry of
    that name) is ignored.  Previously such a tag drained the stack and
    then raised IndexError.  An empty ``tag`` still discards the innermost
    open entry without recording it.
    """
    if not self.span_stack:
        # Stray end tag: nothing is open.
        return
    if not tag:
        self.span_stack.pop()
        return
    if not any(tag in entry for entry in self.span_stack):
        # Unmatched end tag: leave the currently open spans untouched.
        return

    # Close all tags on the stack until a matching end tag is found.
    # XXX: need to return to LEVEL, not same tag name
    while True:
        entry = self.span_stack.pop()
        # list(...) keeps the single-item lookup working whether items()
        # returns a list or a view.
        prev_tag, (attr, (offset,)) = list(entry.items())[0]
        if prev_tag == tag:
            break
        self.markup.append({prev_tag: (attr, (offset, 0))})

    length = len(self.content) - offset
    self.markup.append({tag: (attr, (offset, length))})
def do_unknown_tag(self, tag, attr):
    """Assert that this hook is only reached with an empty tag and no attributes.

    NOTE(review): no caller is visible in this file -- confirm when, if
    ever, the parser invokes it.
    """
    message = "do_unknown_tag %s, %s" % (tag, attr)
    assert not (tag or attr), message
def close(self):
    """Finish parsing through the base class, then drop the collected results."""
    sgmllib.SGMLParser.close(self)
    # Rebind rather than clear in place, so objects handed out earlier
    # (e.g. a markup list already returned to a caller) keep their data.
    self.content, self.markup, self.span_stack = '', [], []
def parse_data(data):
    """Parse ``data`` with a fresh MyParser and return ``(content, markup)``.

    The results are captured before close() is called, because close()
    resets them on the parser.
    """
    parser = MyParser()
    parser.feed(data)
    result = (parser.content, parser.markup)
    parser.close()
    return result
print parse_data('<test><t />Test</test>')
print parse_data('<xml>\n</xml>\r\n')
print parse_data('<sgml><s>Test 3</s></sgml>')
--
http://mail.python.org/mailman/listinfo/python-list