Hello all, this is my first time doing web programming in Python, so I am looking for suggestions. I wrote this code to download the titles of web pages using as few resources (server time, data downloaded) as possible, and it should be reasonably fast. Initially I used BeautifulSoup for parsing, but the person who is going to use this code asked me to use regular expressions instead (the reason being that BeautifulSoup is not fast enough?). Also, initially I downloaded the whole page, but in the end I restricted it to the first 30000 characters, which is enough to get the title of almost all pages. Right now I can see only two shortcomings in this code. First, when I kill it with SIGINT (Ctrl-C) it dies instantly; I could modify it to process all the elements remaining in the queue and then exit (a sketch of this follows the code below). Second, there is one IO call per iteration in the downloadurl function (maybe I could use an async IO call, but I am not sure; a batched-writer sketch also follows the code). I don't have much web programming experience, so I am looking for suggestions to make this more robust, and also for suggestions on writing more idiomatic Python. top-1m.csv is the file downloaded from Alexa [1].
-Mukesh Tiwari

[1] http://www.alexa.com/topsites

import urllib2, os, socket, Queue, thread, signal, sys, re

class Downloader():

    def __init__(self):
        self.q = Queue.Queue(200)
        self.count = 0

    def downloadurl(self):
        # [^>]* keeps the match inside the opening tag, and DOTALL handles
        # titles that span lines; compiled once instead of once per URL.
        regex = re.compile('<title[^>]*>(.*?)</title>',
                           re.IGNORECASE | re.DOTALL)
        # Open the output file in append mode and write one result per line
        # (possible improvement: write in chunks).
        with open('titleoutput.dat', 'a+') as file:
            while True:
                try:
                    url = self.q.get()
                    data = urllib2.urlopen(url, data=None, timeout=10).read(30000)
                    # (An earlier version searched the page line by line; a
                    # single regex search over the buffer replaced it.)
                    title = regex.search(data)
                    if title is None:
                        print ''.join([url, ' : no title found'])
                        continue
                    result = ', '.join([url, title.group(1)])
                    file.write(''.join([result, '\n']))
                except urllib2.HTTPError as e:
                    print ''.join([url, ' ', str(e)])
                except urllib2.URLError as e:
                    print ''.join([url, ' ', str(e)])
                except Exception as e:
                    print ''.join([url, ' ', str(e)])
        # The with block calls file.close() automatically.

    def createurl(self):
        # Check if the state file exists; if not, create it with a default
        # of 0 chunks read.
        if os.path.exists('bytesread.dat'):
            f = open('bytesread.dat', 'r')
            self.count = int(f.readline())
        else:
            f = open('bytesread.dat', 'w')
            f.write('0\n')
        f.close()
        # Reading in 1K chunks is fast, but a site can be lost at a chunk
        # boundary on restart (worth it, because reading is very fast).
        with open('top-1m.csv', 'r') as file:
            prefix = ''
            file.seek(self.count * 1024)
            # The seek may land mid-line, so discard up to the next newline.
            if self.count:
                file.readline()
            for lines in iter(lambda: file.read(1024), ''):
                l = lines.split('\n')
                n = len(l)
                l[0] = ''.join([prefix, l[0]])
                for i in xrange(n - 1):
                    self.q.put(''.join(['http://www.', l[i].split(',')[1]]))
                prefix = l[n - 1]
                self.count += 1

    # Do a graceful exit from here.
    def handleexception(self, signum, frame):
        with open('bytesread.dat', 'w') as file:
            print ''.join(['Number of 1K chunks read (probably unfinished) ',
                           str(self.count)])
            file.write(''.join([str(self.count), '\n']))
        sys.exit(0)

if __name__ == '__main__':
    u = Downloader()
    signal.signal(signal.SIGINT, u.handleexception)
    thread.start_new_thread(u.createurl, ())
    for i in xrange(5):
        thread.start_new_thread(u.downloadurl, ())
    while True:
        pass
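A minimal sketch of the graceful exit mentioned above, assuming the same Queue-based design: the SIGINT handler sets a flag instead of exiting, and each worker drains whatever is already queued before returning. fetch_title is a hypothetical stand-in for the fetch-and-write body of downloadurl.

import Queue
import signal
import threading

stop = threading.Event()

def handle_sigint(signum, frame):
    # Request shutdown; workers finish the queued URLs first.
    stop.set()

def worker(q):
    while True:
        try:
            # A timeout lets an idle worker re-check the stop flag.
            url = q.get(timeout=1)
        except Queue.Empty:
            if stop.is_set():
                return          # queue drained and shutdown requested
            continue
        fetch_title(url)        # hypothetical per-URL work from downloadurl

signal.signal(signal.SIGINT, handle_sigint)

The producer would likewise check stop.is_set() before each q.put() so the queue cannot keep growing after Ctrl-C.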
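On the one-write-per-URL point: a sketch of handing results to a single writer thread that batches output, assuming workers put (url, title) tuples on a shared results queue instead of writing themselves. As a side effect this also stops five threads from appending to the same file concurrently.

import Queue
import threading

def writer(results, path='titleoutput.dat', batch=100):
    buf = []
    with open(path, 'a') as out:
        while True:
            url, title = results.get()
            buf.append('%s, %s\n' % (url, title))
            if len(buf) >= batch:     # one IO call per `batch` results
                out.writelines(buf)
                buf = []

results = Queue.Queue()
t = threading.Thread(target=writer, args=(results,))
t.daemon = True
t.start()
# In downloadurl(), replace the file.write(...) call with:
#     results.put((url, title.group(1)))

The trade-off is that up to batch-1 results sit in memory and are lost on an abrupt exit, so a real version would flush the remainder during shutdown.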
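Finally, on idiomatic style: the closing `while True: pass` loop spins a CPU core doing nothing. One alternative sketch, assuming downloadurl is extended to call self.q.task_done() after finishing each URL, uses joinable threads from the threading module (the low-level thread module has no join) and blocks on the queue instead:

import threading

u = Downloader()
signal.signal(signal.SIGINT, u.handleexception)

producer = threading.Thread(target=u.createurl)
producer.start()
for i in xrange(5):
    t = threading.Thread(target=u.downloadurl)
    t.daemon = True   # workers loop forever; do not block interpreter exit
    t.start()

producer.join()       # every URL has been queued
u.q.join()            # every queued URL has been marked done via task_done()

One caveat: in CPython 2 an argumentless Thread.join() can delay signal delivery, so joining the producer in a loop with a short timeout (e.g. producer.join(0.5) while it is alive) keeps Ctrl-C responsive.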