Hi, Can someone explain me, what is wrong with this site ? python linkExtractor3.py http://www.noticiasdeaveiro.pt > test
HTMLParser.HTMLParseError: EOF in middle of construct, at line 1173, column 1 at line 1173 of test file is perfectly normal . I like to know what I have to clean up before parse the html page I send in attach the python code . thanks in advance -- Sérgio M. B.
import urllib import urlparse import re from HTMLParser import HTMLParser, HTMLParseError class ParserExtractor(HTMLParser): def __init__(self, base, content): HTMLParser.__init__(self) self.__content = content self.__base = base self.__links = [] def links(self): self.feed(self.__content) self.close() return self.__links def handle_starttag(self, tag, attr): attr = dict(attr) if 'a' == tag: self.start_a(attr) if 'base' == tag: self.start_base(attr) def start_a(self, attr): l = attr.get('href') if l: self.__links.append( urlparse.urljoin(self.__base, l) ) def start_base(self, attr): l = attr.get('href') if l: self.__base = l def getLinks(url): content = None base = url content = urllib.urlopen(url).read(-1) # clean scripts and comments p = re.compile('<script.*?script>',re.S|re.I) content = p.sub('', content) p = re.compile('<!--.*?-->',re.S) content = p.sub('', content) print content links = [] parser = ParserExtractor(url, content) links = parser.links() return links if __name__ == '__main__': from sys import argv for l in getLinks(argv[1]): print l
smime.p7s
Description: S/MIME cryptographic signature
-- http://mail.python.org/mailman/listinfo/python-list