Attached is the essence of my crawler. It collects the <a> links on a given URL.

HTML parsing is not a big deal, as "tidy" does it all for you: it converts
broken HTML into valid XHTML. From that point on there is a wealth of XML
libraries; just write whatever you want, such as an <a> element handler.
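
For example, once tidy has produced XHTML, a handler can be as simple as this
(a rough sketch, assuming uTidylib and minidom as in the script below;
getElementsByTagName() takes the place of the recursive walk I use there):

import tidy, xml.dom.minidom

broken = "<html><body><p>Hello<a href='http://example.com'>a link</body>"
# tidy repairs the markup and emits well-formed XHTML
xhtml = str(tidy.parseString(broken, output_xhtml=True, tidy_mark=False))
dom = xml.dom.minidom.parseString(xhtml)
for a in dom.getElementsByTagName('a'):
    print a.getAttribute('href')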

I've extended it to be multi-threaded, to limit the number of threads per
web host, to allow more flexible element handling, and so on. By the way,
SQLite is nice for building the URL database.
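
Just to illustrate the idea, something like this (a hypothetical sketch; the
'urls' table and its columns are made up here, not my actual schema):

import sqlite3

conn = sqlite3.connect('urls.db')
conn.execute("""CREATE TABLE IF NOT EXISTS urls (
                    url     TEXT PRIMARY KEY,
                    visited INTEGER DEFAULT 0)""")

def enqueue(urls):
    # INSERT OR IGNORE silently drops URLs that are already queued
    conn.executemany("INSERT OR IGNORE INTO urls (url) VALUES (?)",
                     [(u,) for u in urls])
    conn.commit()

def next_unvisited():
    # pick any URL that has not been crawled yet
    row = conn.execute(
        "SELECT url FROM urls WHERE visited = 0 LIMIT 1").fetchone()
    return row[0] if row else None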

Kenji Noguchi

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, urllib, urllib2, cookielib
import xml.dom.minidom, tidy
from xml.parsers.expat import ExpatError
from urlparse import urlparse, urljoin

_ua = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12"

# I'm not sure if CookieJar() is thread safe
cj = cookielib.CookieJar()

class SingleCrawler:
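    # fetches a single page and collects the absolute URLs of its <a> links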
    def __init__(self, seed_url=None):
        self.seed_url = seed_url
        self.urls = {}

    # helper: run tidy over (possibly broken) HTML and return valid XHTML
    def _convert(self, html):
        if isinstance(html, unicode):
            html = html.encode('utf-8')
        options = dict(
            doctype='strict',
            drop_proprietary_attributes=True,
            enclose_text=True,
            output_xhtml=True,
            wrap=0,
            char_encoding='utf8',
            newline='LF',
            tidy_mark=False,
            )
        return str(tidy.parseString(html, **options))

    def _collect_urls(self, node, nest=0):
        # recursively walk the DOM, recording every crawlable <a> href
        if node.nodeType == node.ELEMENT_NODE and node.nodeName == 'a':
            href = node.getAttribute('href')
            if not href.startswith('#'):
                p = urlparse(href)
                if p.scheme in ('', 'http', 'https'):
                    self.urls[href] = True
                else:
                    # skip non-crawlable schemes (mailto, javascript, ...)
                    print "skipping scheme:", p.scheme
                
        for i in node.childNodes:
            self._collect_urls(i, nest+1)

    def canonicalize(self):
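        # resolve relative hrefs against the seed URL and de-duplicate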
        d = {}
        
        for url in self.urls:
            d[urljoin(self.seed_url, url).encode('ascii')] = True
        self.urls = d
        
    def crawl(self):
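        # fetch the seed URL, tidy it into XHTML, then harvest <a> hrefs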
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', _ua)]
        try:
            html = opener.open(self.seed_url).read()
        except urllib2.HTTPError, e:
            # e.g. 404/500: give up on this URL
            return None
        except urllib2.URLError, e:
            print "URL Error:", self.seed_url
            return None
        if html.startswith('<?xml'):
            # destroy xhtml ;-)
            html = html[html.index('?>')+2:]
            
        html = self._convert(html)
        try:
            dom = xml.dom.minidom.parseString(html)
        except ExpatError, e:
            print "ExpatError:", html
            return None
        
        # documentElement is the <html> element, regardless of the doctype node
        self._collect_urls(dom.documentElement)
        self.canonicalize()
        return self.urls.keys()

if __name__=='__main__':
    crawler = SingleCrawler('http://www.python.org')
    next_urls = crawler.crawl()
    print next_urls
