Hi, can someone explain to me what is wrong with this site? python linkExtractor3.py http://www.noticiasdeaveiro.pt > test
HTMLParser.HTMLParseError: EOF in middle of construct, at line 1173, column 1. Line 1173 of the test file looks perfectly normal. I would like to know what I have to clean up before parsing the HTML page. I am sending the Python code as an attachment. Thanks in advance -- Sérgio M. B.
import urllib
import urlparse
import re
from HTMLParser import HTMLParser, HTMLParseError
class ParserExtractor(HTMLParser):
    """Collect the absolute URLs of every ``<a href=...>`` in an HTML page.

    Relative links are resolved against the base URL given to the
    constructor.  A ``<base href=...>`` tag met while parsing replaces that
    base for the links that come after it.
    """

    def __init__(self, base, content):
        """Remember the page to parse; nothing is parsed yet.

        base    -- URL used to resolve relative links.
        content -- HTML source of the page.
        """
        HTMLParser.__init__(self)
        self.__content = content
        self.__base = base
        self.__links = []
        # Becomes True once the content has been fed successfully.
        self.__parsed = False

    def links(self):
        """Return the list of absolute link URLs found in the content.

        The content is parsed on the first successful call only, so calling
        this method again returns the same links instead of parsing the page
        a second time and duplicating every entry.  Any error raised by the
        underlying parser propagates to the caller.
        """
        if not self.__parsed:
            self.feed(self.__content)
            self.close()
            # Only mark as parsed after feed/close succeeded, so a failed
            # parse is retried (and fails again) rather than silently
            # returning a partial result.
            self.__parsed = True
        # Hand out a copy so callers cannot corrupt the collected links.
        return list(self.__links)

    def handle_starttag(self, tag, attr):
        """Dispatch ``<a>`` and ``<base>`` start tags to their handlers.

        tag  -- tag name as reported by HTMLParser.
        attr -- list of (name, value) pairs for the tag's attributes.
        """
        attr = dict(attr)
        if tag == 'a':
            self.start_a(attr)
        elif tag == 'base':
            self.start_base(attr)

    def start_a(self, attr):
        """Record the link of an ``<a>`` tag, resolved against the base.

        attr -- dict of the tag's attributes; tags without a non-empty
                ``href`` are ignored.
        """
        href = attr.get('href')
        if href:
            self.__links.append(urlparse.urljoin(self.__base, href))

    def start_base(self, attr):
        """Switch the base URL to the ``href`` of a ``<base>`` tag.

        attr -- dict of the tag's attributes; a missing or empty ``href``
                leaves the current base unchanged.
        """
        href = attr.get('href')
        if href:
            self.__base = href
def getLinks(url):
    """Download ``url`` and return the absolute URLs of the links it contains.

    The page is fetched with ``urllib.urlopen``; ``<script>`` elements and
    HTML comments are removed with regular expressions before the remaining
    markup is handed to ``ParserExtractor``.  The cleaned markup is also
    printed to standard output.

    url -- address of the page; it is also the initial base for resolving
           relative links.

    Returns the list produced by ``ParserExtractor.links()``.  Errors from
    the download or from the HTML parser propagate to the caller.
    """
    response = urllib.urlopen(url)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # clean scripts and comments
    script_re = re.compile('<script.*?script>', re.S | re.I)
    content = script_re.sub('', content)
    comment_re = re.compile('<!--.*?-->', re.S)
    content = comment_re.sub('', content)

    # Echo exactly the text that is fed to the parser, so that line/column
    # numbers in parser errors can be looked up in the output.
    print(content)

    parser = ParserExtractor(url, content)
    return parser.links()
if __name__ == '__main__':
    # Command-line entry point: print one extracted link per line.
    import sys

    # The page URL is a required argument; fail with a usage message rather
    # than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s URL' % sys.argv[0])

    for link in getLinks(sys.argv[1]):
        print(link)
smime.p7s
Description: S/MIME cryptographic signature
-- http://mail.python.org/mailman/listinfo/python-list
