Berend van Berkum wrote:
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1


Hi everyone,

I read the source and made numerous tests, but SGMLParser keeps returning *tag* data from previous parser instances. I'm totally confused as to why. The content data it
returns is OK.

E.g.::

    sp = MyParser()
    sp.feed('<test><t />Test</test>')
    print sp.content, sp.markup
    sp.close()

    sp = MyParser()
    sp.feed('<xml>\n</xml>\r\n')
    print sp.content, sp.markup
    sp.close()

gives::

('Test', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}]) ('\n\r\n', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}, {'xml': ({}, (0, 1))}])

It keeps the tags from the previous session, while I'm sure the stack etc.
should be clean.

Any ideas?


regards, Berend

- ----

import sgmllib


class MyParser(sgmllib.SGMLParser):

        # Class-level attributes: these objects are created once, on the class
        # itself, so every instance that does not rebind them shares them.
        content = ''            # character data accumulated by handle_data()
        markup = []             # finished spans: {tag: (attrs, (offset, length))}
        span_stack = []         # currently open tags: {tag: (attrs, (offset,))}

These are defined on the _class_ itself, so they are shared by all of its
instances. You should do something like this instead:

        def __init__(self):
                """Create a parser that owns its own result containers.

                The base-class initialiser is called first: it is what sets up
                the internal parser state (via ``reset()``) that ``feed()``
                depends on.  The per-instance attributes assigned afterwards
                keep results from one parser from leaking into another.
                """
                sgmllib.SGMLParser.__init__(self)
                self.content = ''       # character data seen so far
                self.markup = []        # finished spans
                self.span_stack = []    # tags opened but not yet closed

        def handle_data(self, data):
                """Append a chunk of character data to the collected content."""
                self.content = self.content + data

        def unknown_starttag(self, tag, attr):
                """Record an opening tag on the span stack.

                The pushed entry is a one-key dict mapping the tag name to
                ``(attributes, (start_offset,))``, where the offset is the length
                of the content collected before the tag was seen.
                """
                attributes = dict(attr)
                start = len(self.content)
                entry = {tag: (attributes, (start,))}
                self.span_stack.append(entry)

        def unknown_endtag(self, tag):
                """Close ``tag`` and record its span in ``self.markup``.

                Open tags are popped from ``self.span_stack`` until the entry
                for ``tag`` is reached.  Every tag popped on the way (one that
                was opened but never explicitly closed) is recorded as a
                zero-length span at its start offset.  The matching tag is then
                recorded as ``{tag: (attrs, (offset, length))}``, where
                ``length`` is the amount of content collected since it opened.

                An end tag that matches no open tag is ignored instead of
                popping from an empty stack and raising IndexError.
                """
                # Each stack entry is a one-key dict, so ``tag in entry`` tests
                # the tag name.  Bail out early if there is nothing to close.
                if not any(tag in entry for entry in self.span_stack):
                        return

                prev_tag, (attr, (offset,)) = list(self.span_stack.pop().items())[0]

                # Close all tags on the stack until the matching one is found.
                # XXX: need to return to LEVEL, not same tag name
                while tag != prev_tag:
                        span = {prev_tag: (attr, (offset, 0))}
                        self.markup.append(span)

                        prev_tag, (attr, (offset,)) = list(self.span_stack.pop().items())[0]

                length = len(self.content) - offset
                span = {tag: (attr, (offset, length))}
                self.markup.append(span)

        def do_unknown_tag(self, tag, attr):
                """Debugging hook: assert that no tag name or attributes were passed."""
                assert not (tag or attr), "do_unknown_tag %s, %s" % (tag, attr)

        def close(self):
                """Finish parsing through the base class, then discard the results.

                Callers that want ``content`` or ``markup`` must read them
                before calling this method, because both are reset here.
                """
                sgmllib.SGMLParser.close(self)
                self.content, self.markup, self.span_stack = '', [], []


def parse_data(data):
        """Parse ``data`` with a fresh MyParser and return ``(content, markup)``.

        The results are captured before ``close()`` is called, since closing
        the parser resets them.
        """
        parser = MyParser()
        parser.feed(data)
        content, markup = parser.content, parser.markup
        parser.close()
        return content, markup

print parse_data('<test><t />Test</test>')
print parse_data('<xml>\n</xml>\r\n')
print parse_data('<sgml><s>Test 3</s></sgml>')


--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to