Hi all. To understand the concept of a thread pool in Python, I'm working on a simple single-site web crawler. I would like the program to stop once the thread pool has downloaded all the internal links from a site, but right now it keeps waiting forever even though there are no more links left to download.
Here's my code, I appreciate any comments, I'm programming just for fun and learning ;-) Thanks in advance.

    from BeautifulSoup import BeautifulSoup
    import urllib
    from pprint import pprint
    import string
    from urlparse import urlparse
    import sys
    from threading import Thread
    import time
    from Queue import Queue

    #dirty hack: set default encoding to utf-8
    reload(sys)
    sys.setdefaultencoding('utf-8')

    opener = urllib.FancyURLopener({})

    class Crawler:

        def __init__(self):
            """ Constructor """
            self.missed = 0
            self.url_list = []
            self.urls_queue = Queue()
            self.num_threads = 5
            self._create_threads()

        def get_internal_links(self, url):
            """ Get all internal links from a web page and feed the queue """
            self.url = url
            url_netloc = urlparse(self.url).netloc
            print "Downloading... ", self.url
            time.sleep(5)
            try:
                p = opener.open(self.url)
                #print p.info()
            except IOError:
                print "error connecting to ", self.url
                print "wait..."
                time.sleep(5)
                print "retry..."
                try:
                    p = urllib.urlopen(self.url)
                except IOError:
                    self.missed = self.missed + 1
                    return None
            html = p.read()
            soup = BeautifulSoup(html)
            anchors = soup.findAll('a')
            links = [str(anchor['href']) for anchor in anchors]
            internal_links = [link for link in links if (urlparse(link).netloc == url_netloc)]
            for link in internal_links:
                if link not in self.url_list and link != self.url:
                    self.url_list.append(link)
                    self.urls_queue.put(link)
            print "Queue size: ", self.urls_queue.qsize()
            print "List size: ", str(len(self.url_list))
            print "Errors: ", str(self.missed)
            self._queue_consumer()

        def _queue_consumer(self):
            """ Consume the queue """
            while True:
                url = self.urls_queue.get()
                print 'Next url: ', url
                self.get_internal_links(url)
                self.urls_queue.task_done()

        def _create_threads(self):
            """ Set up some threads to fetch pages """
            for i in range(self.num_threads):
                worker = Thread(target=self._queue_consumer, args=())
                worker.setDaemon(True)
                worker.start()

    #-----------------------------------------------------------------------------
    #
    if __name__ == '__main__':
        c = Crawler()
        c.get_internal_links('http://www.thinkpragmatic.net/')
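For reference, below is a minimal, self-contained sketch of the pattern I think I should be following: the worker threads consume URLs from the Queue and put any newly found links back on it themselves, while the main thread only seeds the queue, calls join() and then exits, so the daemon workers die with it. The fetch_links() function and the fake_site dict are made-up stand-ins for the real download/parse step, just so the sketch runs without touching the network, and I've left out locking around the seen set to keep it short.

    from threading import Thread
    from Queue import Queue

    # Fake "site": maps a URL to the links found on that page.
    fake_site = {
        '/': ['/a', '/b'],
        '/a': ['/b'],
        '/b': [],
    }

    def fetch_links(url):
        """Hypothetical stand-in for the real download-and-parse step."""
        return fake_site.get(url, [])

    urls_queue = Queue()
    seen = set()

    def consumer():
        while True:
            url = urls_queue.get()
            print 'Crawling:', url
            for link in fetch_links(url):
                if link not in seen:         # a real crawler would lock around this
                    seen.add(link)
                    urls_queue.put(link)     # workers feed the queue themselves
            urls_queue.task_done()           # exactly one task_done() per get()

    for i in range(5):
        worker = Thread(target=consumer)
        worker.setDaemon(True)               # daemons die when the main thread exits
        worker.start()

    seen.add('/')
    urls_queue.put('/')                      # the main thread only seeds the queue...
    urls_queue.join()                        # ...and blocks until every task is done
    print 'Finished, crawled:', sorted(seen)

What I can't work out is how to make my Crawler class above terminate the same way; as far as I can tell, my main thread ends up inside _queue_consumer() blocking on get() instead of waiting on join().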