Here is some code which crawls links sent to it. There is some problem with the retrieve_url function; please help me debug it. The function retrieves pages and saves them to file. (Sketches for the TODOs follow the listing.)

#TODO: The visited dict grows in size; it needs to be handled smartly
#TODO: The server program needs to be kept in sync with the client, e.g. Myrobot
#TODO: Take care of the 'If-Modified-Since' header, repeated links, and hash links

#This is the client side of the distributed crawling framework.
#It gets the list of urls to be crawled,
#then crawls the urls and stores the pages in a temporary archive,
#which is then transferred to the server, or grey_matter.

import httplib
import os
import sys
import time
import urlparse
import urllib2
import urllib
import zipfile
import threading
from socket import *

PAGE_DIR = "C:/users/jayesh/pages/"   # directory where the web pages are stored
                                      # temporarily before transfer to the grey_matter
visited = {}                          # a dict to remember visited urls
ROBOT_COUNT = 4


def fget():
    """Retrieves the zipped file containing the list of urls from the
    grey_matter and saves the list in a local file 'list.txt'."""
    httplib.HTTPConnection.debuglevel = 1
    request = urllib2.Request('http://192.168.153.57/list.zip')   # request the zipped file
    request.add_header('Accept-encoding', 'gzip')                 # containing the list of urls
    opener = urllib2.build_opener()
    flag = 1
    s = 'Waiting for server'
    while flag == 1:
        try:
            op = opener.open(request)
            flag = 0
        except:
            s = s + '*'
            print s
            time.sleep(1)   # retry without hammering the server
    f = open('list.zip', "wb")
    f.write(op.read())
    f.close()
    z = zipfile.ZipFile('list.zip')
    p = z.namelist()
    g = open('list.txt', "wb")
    g.write(z.read(p[0]))
    g.close()
    print 'got zipped file'


def compress():
    """Compresses the crawled pages into a single file ready to be
    sent to the grey_matter."""
    zfile = zipfile.ZipFile('C:/xampp/htdocs/pages.zip', mode='w')
    for fil in os.listdir(PAGE_DIR):
        full = os.path.join(PAGE_DIR, fil)
        zfile.write(full, fil)
        os.remove(full)
    zfile.close()        # the archive is not valid until it is closed
    os.rmdir(PAGE_DIR)   # remove the directory after transfer to the grey_matter


x = 0   # counter used to name pages whose url has no path component


class robot(threading.Thread):
    """The main robot class, which crawls the listed urls it receives
    from the grey_matter. ROBOT_COUNT threads crawl the list concurrently."""

    def __init__(self, urllist, urllistlock, dblock):
        threading.Thread.__init__(self)
        self.urllist = urllist
        self.urllistlock = urllistlock
        self.dblock = dblock

    def popurl(self):
        """Pops urls off the shared list one by one and hands them over
        for retrieval."""
        self.urllistlock.acquire(1)
        if len(self.urllist) < 1:
            Nexturl = None
        else:
            Nexturl = self.urllist[0]
            if Nexturl[-1] == '\n':
                Nexturl = Nexturl[:-1]
            del self.urllist[0]
        self.urllistlock.release()
        return Nexturl

    def retrieve_url(self, url):
        """The main method of the robot class, called from run, which
        retrieves the given url from the web and saves it under PAGE_DIR."""
        global x
        if url is not None:
            try:
                if url in visited:   # NOTE: unsynchronised, and checked before the
                    return           # fragment is stripped; see the sketch below
                pieces = urlparse.urlparse(url)
                filepath = pieces[2]
                if filepath != '':
                    filepath = filepath[1:]
                    filename = filepath.split("/")[-1]
                else:
                    filename = str(x) + '.htm'   # x is an int: x+'.htm' raises
                    x += 1                       # TypeError, so convert it first
                path = os.path.join(PAGE_DIR, filename)
                url = urlparse.urlunparse(pieces)
                p = url.rfind('#')   # temporary handling of hash links;
                if p != -1:          # urlparse.urldefrag(url) is the robust way
                    url = url[:p]
                visited[url] = 1
                m = urllib2.urlopen(url)
                fopen = open(path, 'wb')
                fopen.seek(0)
                fopen.write(url + '|')   # prefix the saved page with its url
                fopen.write(m.read())
                fopen.close()
                print url, 'retrieved'
            except IOError:
                print url
                print "ERROR: OOPS! THE URL CAN'T BE RETRIEVED"
            return

    def run(self):
        while 1:
            url = self.popurl()
            if url is None:
                break
            try:
                self.retrieve_url(url)
            except:
                sys.exit()   # any unexpected error kills this robot thread


if __name__ == '__main__':
    s = socket(AF_INET, SOCK_STREAM)
    s.bind(('', 444))
    s.listen(5)
    q, v = s.accept()
    count = 1
    print 'Connecting...'
    while 1:
        print 'Phase: %s' % count
        message = q.recv(3)
        if message != 'yes':   # wait for the grey_matter's go-ahead token
            continue
        print 'Connected'
        count = count + 1
        fget()   # get the url list from the grey_matter (server)
        try:
            os.mkdir(PAGE_DIR)
        except:
            print "Can't make dir"
        try:
            f = open('list.txt', 'r')
            urllist = f.readlines()
            f.close()
        except:
            print 'Error opening urls file'
            sys.exit()
        print 'starting threads'
        urllistlock = threading.Lock()
        dblock = threading.Lock()
        botlist = []
        for X in range(0, ROBOT_COUNT):
            newbot = robot(urllist, urllistlock, dblock)
            newbot.setName(str(X))   # give each robot a distinct thread name
            botlist.append(newbot)
            newbot.start()
        for X in range(0, ROBOT_COUNT):
            botlist[X].join()
        compress()
        try:
            q.send('yes')   # tell the grey_matter that pages.zip is ready
        except:
            print 'socket disconnected'
            sys.exit()
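A couple of things I already suspect about retrieve_url, beyond the int-to-str fix noted in the listing: visited is shared by every robot thread but is read and written without any lock, and the membership test runs before the '#' fragment is stripped, so hash variants of an already-visited page slip through. A minimal sketch of what I have in mind, where mark_if_new is a made-up helper name and visited_lock a dedicated lock (the existing dblock would do just as well):

import threading

visited = {}
visited_lock = threading.Lock()   # guards all access to visited

def mark_if_new(url):
    """Atomically record url; return False if it was already visited."""
    visited_lock.acquire()
    try:
        if url in visited:
            return False
        visited[url] = 1
        return True
    finally:
        visited_lock.release()

retrieve_url would then strip the fragment first and bail out when mark_if_new(url) returns False.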
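For the first TODO, the visited dict growing without bound, I am considering a hard cap with first-in-first-out eviction. This assumes Python 2.7 for collections.OrderedDict, and MAX_VISITED is just a number to tune:

from collections import OrderedDict

MAX_VISITED = 100000      # assumed cap; tune to the crawler's memory budget
visited = OrderedDict()   # keeps insertion order, so the oldest url is first

def remember(url):
    """Record url, evicting the oldest entry once the cap is reached."""
    if url in visited:
        return
    if len(visited) >= MAX_VISITED:
        visited.popitem(last=False)   # drop the least recently added url
    visited[url] = 1

The trade-off is that a url evicted from the dict can be crawled again later, which seems acceptable for a crawler that works in phases anyway.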
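For the 'If-Modified-Since' TODO, my understanding is that urllib2 lets me send the header myself; a server that supports it answers 304 Not Modified, which urllib2 raises as an HTTPError. A sketch, where fetch_if_changed is a hypothetical helper not wired into the robot yet:

import urllib2
from email.utils import formatdate

def fetch_if_changed(url, last_fetch_time):
    """Return the page body, or None if the server says it has not changed
    since last_fetch_time (seconds since the epoch, as from time.time())."""
    req = urllib2.Request(url)
    req.add_header('If-Modified-Since', formatdate(last_fetch_time, usegmt=True))
    try:
        return urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        if e.code == 304:   # not modified since the last crawl
            return None
        raise

This would also need the crawl time of each page to be stored somewhere, which ties back into how the visited structure is handled.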
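I also noticed that fget sends 'Accept-encoding: gzip' but writes op.read() straight to list.zip; if the server ever honours that header, the body will be gzip-wrapped and zipfile will refuse it. I would either drop the header or unwrap the body when the response says it is compressed, roughly:

import gzip
import StringIO

def read_body(response):
    """Return the body of a urllib2 response, gunzipping it if the server
    applied Content-Encoding: gzip."""
    data = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return data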
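Finally, on keeping the grey_matter in sync with Myrobot: the client above listens on port 444, waits for the token 'yes' before each phase, and answers 'yes' once pages.zip is ready. I do not have the server code at hand, so this is only my guess at its side of the handshake, with CLIENT_ADDR a placeholder for wherever Myrobot runs:

from socket import socket, AF_INET, SOCK_STREAM

CLIENT_ADDR = 'myrobot.example'   # placeholder hostname of the crawler machine

s = socket(AF_INET, SOCK_STREAM)
s.connect((CLIENT_ADDR, 444))     # the client binds and listens on port 444
while 1:
    s.send('yes')                 # signal that a fresh list.zip is published
    reply = s.recv(3)             # block until the client reports pages.zip
    if reply != 'yes':
        break
    # ...fetch pages.zip from the client and regenerate list.zip here...

Does that sound right, or is there a cleaner way to coordinate the two sides?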