Yaşar Arabacı wrote: > I am using a simple subclass of HTMLParser like this: > > class LinkCollector(HTMLParser): > > def reset(self): > self.links = [] > HTMLParser.reset(self) > > def handle_starttag(self,tag,attr): > if tag in ("a","link"): > key = "href" > elif tag in ("img","script"): > key = "src" > else: > return > self.links.extend([v for k,v in attr if k == key]) > > This gives the following error: > > Traceback (most recent call last): > File "downloader.py", line 209, in <module> > if __name__ == "__main__": main() > File "downloader.py", line 201, in main > link_collect.feed(response) > File "C:\Python27\lib\HTMLParser.py", line 108, in feed > self.goahead(0) > File "C:\Python27\lib\HTMLParser.py", line 148, in goahead > k = self.parse_starttag(i) > File "C:\Python27\lib\HTMLParser.py", line 252, in parse_starttag > attrvalue = self.unescape(attrvalue) > File "C:\Python27\lib\HTMLParser.py", line 393, in unescape > return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, > s) > File "C:\Python27\lib\re.py", line 151, in sub > return _compile(pattern, flags).sub(repl, string, count) > UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 13: > ordinal not in range(128)
Trying to reproduce the error: >>> from HTMLParser import HTMLParser >>> class P(HTMLParser): ... def handle_starttag(self, tag, attrs): ... key, value = attrs[0] ... print tag, key, "=", value ... >>> def feed(s): ... P().feed(s) ... >>> feed("<a href='yadda'/>") a href = yadda >>> feed("<a href='ä yadda'/>") a href = ä yadda >>> feed("<a href='&auml; ä'/>") Traceback (most recent call last): File "<stdin>", line 1, in <module> File "<stdin>", line 2, in feed File "/usr/local/lib/python2.7/HTMLParser.py", line 108, in feed self.goahead(0) File "/usr/local/lib/python2.7/HTMLParser.py", line 148, in goahead k = self.parse_starttag(i) File "/usr/local/lib/python2.7/HTMLParser.py", line 252, in parse_starttag attrvalue = self.unescape(attrvalue) File "/usr/local/lib/python2.7/HTMLParser.py", line 390, in unescape return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) File "/usr/local/lib/python2.7/re.py", line 151, in sub return _compile(pattern, flags).sub(repl, string, count) UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128) It seems that the exception is triggered by an attribute value that contains both entities and non-ascii bytes. >>> feed(u"<a href='&auml; ä'/>") a href = ä ä > Rest of the code available as attachment. Does anyone know how to solve > this? The documentation doesn't mention unicode, but it seems to work anyway: >>> feed(u"<a href='&auml; ä'/>") a href = ä ä So one fix might be to convert the data to unicode before passing it to the HTMLParser. -- http://mail.python.org/mailman/listinfo/python-list