Attached is the essence of my crawler. It collects the <a> tags from a given URL.
HTML parsing is not a big deal, as "tidy" does it all for you: it converts broken HTML into valid XHTML, and from that point on there is a wealth of XML libraries to choose from. Just write whatever you want, such as an <a> element handler. I've extended this to be multi-threaded, to limit the number of threads per web host, to handle elements more flexibly, and so on (see the sketches after the code below). SQLite is nice for building the URL db, by the way.

Kenji Noguchi
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, urllib, urllib2, cookielib
import xml.dom.minidom, tidy
from urlparse import urlparse, urljoin
from xml.parsers.expat import ExpatError  # was missing; crawl() catches this below

_ua = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12"

# I'm not sure if CookieJar() is thread safe
cj = cookielib.CookieJar()

class SingleCrawler:
    def __init__(self, seed_url=None):
        self.seed_url = seed_url
        self.urls = {}

    # static: uses no instance state
    def _convert(self, html):
        # Run tidy over the raw HTML and return valid XHTML.
        if isinstance(html, unicode):
            html = html.encode('utf-8')
        options = dict(
            doctype='strict',
            drop_proprietary_attributes=True,
            enclose_text=True,
            output_xhtml=True,
            wrap=0,
            char_encoding='utf8',
            newline='LF',
            tidy_mark=False,
        )
        return str(tidy.parseString(html, **options))

    def _collect_urls(self, node, nest=0):
        # Walk the DOM recursively, recording the href of every <a> element.
        if node.nodeType == 1 and node.nodeName == 'a':
            href = node.getAttribute('href')
            if not href.startswith('#'):  # skip in-page fragment links
                p = urlparse(href)
                if p.scheme in ('', 'http', 'https'):
                    self.urls[href] = True
                else:
                    # mailto, javascript
                    print p.scheme
        for i in node.childNodes:
            self._collect_urls(i, nest+1)

    def canonicalize(self):
        # Resolve relative URLs against the seed URL.
        d = {}
        for url in self.urls:
            d[urljoin(self.seed_url, url).encode('ascii')] = True
        self.urls = d

    def crawl(self):
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', _ua)]
        try:
            html = opener.open(self.seed_url).read()
        except urllib2.HTTPError, e:
            return None
        except urllib2.URLError, e:
            print "URL Error:", self.seed_url
            return None
        if html.startswith('<?xml'):
            # destroy xhtml ;-) strip the XML declaration so tidy sees plain HTML
            html = html[html.index('?>')+2:]
        html = self._convert(html)
        try:
            dom = xml.dom.minidom.parseString(html)
        except ExpatError, e:
            print "ExpatError:", html
            return None
        # childNodes[0] is the doctype; childNodes[1] is the <html> element
        self._collect_urls(dom.childNodes[1])
        self.canonicalize()
        return self.urls.keys()

if __name__ == '__main__':
    crawler = SingleCrawler()
    crawler.seed_url = 'http://www.python.org'
    next_urls = crawler.crawl()
    print next_urls
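The per-host thread limit mentioned above can be done with one semaphore per host name. A minimal sketch, not part of the crawler itself: the limit of 2 and the crawl_politely wrapper are illustrative choices, not the original code's.

import threading
from urlparse import urlparse
from collections import defaultdict

_max_per_host = 2  # assumed limit; pick whatever is polite for your targets
_host_slots = defaultdict(lambda: threading.Semaphore(_max_per_host))
_slots_lock = threading.Lock()  # guards the defaultdict itself

def crawl_politely(url):
    # At most _max_per_host threads fetch from the same host at once.
    with _slots_lock:
        sem = _host_slots[urlparse(url).netloc]
    sem.acquire()
    try:
        return SingleCrawler(url).crawl()
    finally:
        sem.release()

Each worker thread just calls crawl_politely(url); threads hitting different hosts never block each other.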
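And on the SQLite point: a minimal sketch of a URL db where the PRIMARY KEY doubles as the dedup check. The schema and function names here are just one way to do it.

import sqlite3

conn = sqlite3.connect('urls.db')
conn.execute('''CREATE TABLE IF NOT EXISTS urls (
                    url     TEXT PRIMARY KEY,
                    crawled INTEGER DEFAULT 0)''')

def enqueue(urls):
    # INSERT OR IGNORE silently drops URLs we have already seen.
    conn.executemany('INSERT OR IGNORE INTO urls (url) VALUES (?)',
                     [(u,) for u in urls])
    conn.commit()

def next_url():
    # Pop one uncrawled URL from the frontier, or None when done.
    row = conn.execute('SELECT url FROM urls WHERE crawled = 0 LIMIT 1').fetchone()
    if row is None:
        return None
    conn.execute('UPDATE urls SET crawled = 1 WHERE url = ?', (row[0],))
    conn.commit()
    return row[0]

The crawl loop then becomes: enqueue(crawler.crawl() or []), followed by next_url() to pick the next seed.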