I am trying to modify code from a web crawler so it scrapes certain websites for keywords. Before I modify it, though, I'm trying to run the web crawler as-is, and I'm running into issues.
When I ran this code:

import threading
from Queue import Queue
from spider import Spider
from domain import get_domain_name
from general import file_to_set

PROJECT_NAME = "SPIDER"
HOME_PAGE = "https://www.cracked.com/"
DOMAIN_NAME = get_domain_name(HOME_PAGE)
QUEUE_FILE = '/home/me/research/queue.txt'
CRAWLED_FILE = '/home/me/research/crawled.txt'
NUMBER_OF_THREADS = 1
# Capitalize these and make them class variables to treat them as constants

threadqueue = Queue()

Spider(PROJECT_NAME, HOME_PAGE, DOMAIN_NAME)

def crawl():
    change = file_to_set(QUEUE_FILE)
    if len(change) > 0:
        print str(len(change)) + ' links in the queue'
        create_jobs()

def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        threadqueue.put(link)  # .put = put item into the queue
    threadqueue.join()
    crawl()

def create_spiders():
    for _ in range(NUMBER_OF_THREADS):  # _ because we don't act on the loop variable
        vari = threading.Thread(target=work)
        vari.daemon = True  # makes sure the thread dies when main exits
        vari.start()

# def regex():
#     for i in file_to_set(CRAWLED_FILE):
#         reg(i, LISTS)  # MAKE FUNCTION FOR REGEX: i is a URL, LISTS is a list or set of keywords

def work():
    while True:
        url = threadqueue.get()  # pops an item off the queue
        Spider.crawl_pages(threading.current_thread().name, url)
        threadqueue.task_done()

create_spiders()
crawl()

That used this class:

from HTMLParser import HTMLParser
from urlparse import urljoin

class LinkFinder(HTMLParser):

    def __init__(self, base_url, page_url):
        HTMLParser.__init__(self)  # HTMLParser is an old-style class in Python 2, so call __init__ directly
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()  # stores the links

    def error(self, message):
        pass

    def handle_starttag(self, tag, attrs):
        if tag == 'a':  # an anchor tag, i.e. a link
            for (attribute, value) in attrs:
                if attribute == 'href':  # href may be a relative URL, i.e. not having www
                    url = urljoin(self.base_url, value)
                    self.links.add(url)

    def return_links(self):
        return self.links

And this spider class:

from urllib import urlopen  # connects to web pages from Python
from link_finder import LinkFinder
from general import directory, text_maker, file_to_set, conversion_to_set

class Spider():

    project_name = 'Reader'
    base_url = ''
    Queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.Queue_file = '/home/me/research/queue.txt'
        Spider.crawled_file = '/home/me/research/crawled.txt'
        self.boot()
        self.crawl_pages('Spider 1', base_url)

    @staticmethod
    def boot():
        directory(Spider.project_name)
        text_maker(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.Queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_pages(thread_name, page_url):
        if page_url not in Spider.crawled:
            print thread_name + ' crawling ' + page_url
            print 'queue ' + str(len(Spider.queue)) + ' | crawled ' + str(len(Spider.crawled))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.info().getheader('Content-Type'):
                read = response.read()
                html_string = read.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except:
            print 'Error: cannot crawl page'
            return set()
        return finder.return_links()

    @staticmethod
    def add_links_to_queue(links):
        for i in links:
            if i in Spider.queue:
                continue
            if i in Spider.crawled:
                continue
            # if Spider.domain_name != get_domain_name(i):
            #     continue
            Spider.queue.add(i)

    @staticmethod
    def update_files():
        conversion_to_set(Spider.queue, Spider.Queue_file)
        conversion_to_set(Spider.crawled, Spider.crawled_file)

and these functions:

from urlparse import urlparse

# get the subdomain name (name.example.com)
def subdomain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''

def get_domain_name(url):
    try:
        variable = subdomain_name(url).split('.')
        return variable[-2] + '.' + variable[-1]  # returns the last two pieces, e.g. example.com
    except:
        return ''

(There are more functions, but those are just housekeeping functions.)

The interpreter returned this error:

RuntimeError: maximum recursion depth exceeded while calling a Python object

after crawl() and create_jobs() had called each other a bunch of times. How can I resolve this?

Thanks
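From the error it looks like the depth comes from crawl() and create_jobs() calling each other: every pass over the queue adds two more stack frames that are never popped, so eventually Python hits its recursion limit. Would rewriting crawl() as a plain loop along these lines avoid that? This is only a rough sketch reusing the file_to_set, QUEUE_FILE, and threadqueue names from the first file above; I haven't tested it.

def crawl():
    # Loop instead of having crawl() and create_jobs() call each other,
    # so the call stack stays flat no matter how many links are found.
    while True:
        queued_links = file_to_set(QUEUE_FILE)
        if len(queued_links) == 0:
            break                      # nothing left in the queue file, stop
        print str(len(queued_links)) + ' links in the queue'
        for link in queued_links:
            threadqueue.put(link)      # hand each URL to the worker threads
        threadqueue.join()             # wait until the workers finish this batch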