Hi,
Can someone explain to me what is wrong with this site?

python linkExtractor3.py http://www.noticiasdeaveiro.pt > test

HTMLParser.HTMLParseError: EOF in middle of construct, at line 1173,
column 1

Line 1173 of the test file looks perfectly normal.

I would like to know what I have to clean up before parsing the HTML page.
I am sending the Python code as an attachment.

Thanks in advance,
-- 
Sérgio M. B.
import urllib
import urlparse
import re
from HTMLParser import HTMLParser, HTMLParseError

class ParserExtractor(HTMLParser):
    """Collect the absolute target of every ``<a href=...>`` in an HTML page.

    Relative links are resolved against ``base``.  A ``<base href=...>`` tag
    met while parsing replaces the base used for the links that follow it.
    """

    def __init__(self, base, content):
        """Remember the page to scan; nothing is parsed yet.

        base    -- URL used to resolve relative links.
        content -- HTML source of the page.
        """
        HTMLParser.__init__(self)

        self.__content = content
        self.__base = base
        self.__links = []
        # Set after the content has been fed, so links() parses only once.
        self.__parsed = False

    def links(self):
        """Return the absolute link URLs found in the page, in document order.

        The content is parsed on the first call only; later calls return the
        same result instead of feeding the page again and duplicating every
        link.  Errors raised by the parser are not caught here.
        """
        if not self.__parsed:
            self.feed(self.__content)
            self.close()
            self.__parsed = True
        # Hand out a copy so callers cannot alter the cached result.
        return list(self.__links)

    def handle_starttag(self, tag, attr):
        """Dispatch ``<a>`` and ``<base>`` start tags to their handlers."""
        attr = dict(attr)

        if tag == 'a':
            self.start_a(attr)
        elif tag == 'base':
            self.start_base(attr)

    def start_a(self, attr):
        """Record the anchor's ``href``, resolved against the current base."""
        href = attr.get('href')
        if href:
            self.__links.append(urlparse.urljoin(self.__base, href))

    def start_base(self, attr):
        """Switch the base URL to the ``href`` of a ``<base>`` tag.

        The new base is itself resolved against the current one, so a
        relative ``<base href>`` still yields absolute links.  An absolute
        href simply replaces the current base.
        """
        href = attr.get('href')
        if href:
            self.__base = urlparse.urljoin(self.__base, href)

def getLinks(url):
    content = None
    base = url
    content = urllib.urlopen(url).read(-1)
    # clean scripts and comments
    p = re.compile('<script.*?script>',re.S|re.I)
    content = p.sub('', content)
    p = re.compile('<!--.*?-->',re.S)
    content = p.sub('', content)
    print content

    links = []
    parser = ParserExtractor(url, content)
    links  = parser.links()
        
    return links

if __name__ == '__main__':
    from sys import argv

    for l in getLinks(argv[1]):
        print l

Attachment: smime.p7s
Description: S/MIME cryptographic signature

-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to