I'm working on a basic web spider, and I'm having problems with the urlparser. This is the affected function: ------------------------------ def FindLinks(Website): WebsiteLen = len(Website)+1 CurrentLink = '' i = 0 SpliceStart = 0 SpliceEnd = 0
LinksString = "" LinkQueue = open('C:/LinkQueue.txt', 'a') while (i < WebsiteLen) and (i != -1): #Debugging info #print '-----' #print 'Length = ' + str(WebsiteLen) #print 'SpliceStart = ' + str(SpliceStart) #print 'SpliceEnd = ' + str(SpliceEnd) #print 'i = ' + str(i) SpliceStart = Website.find('<a href="', (i+1)) SpliceEnd = (Website.find('">', SpliceStart)) ParsedURL = urlparse((Website[SpliceStart+9:(SpliceEnd+1)])) robotparser.set_url(ParsedURL.hostname + '/' + 'robots.txt') robotparser.read() if (robotparser.can_fetch("*", (Website[SpliceStart+9:(SpliceEnd+1)])) == False): i = i - 1 else: LinksString = LinksString + "\n" + (Website[SpliceStart+9:(SpliceEnd+1)]) LinksString = LinksString[:(len(LinksString) - 1)] #print 'found ' + LinksString i = SpliceEnd LinkQueue.write(LinksString) LinkQueue.close() ------------------------------ Sorry if it's uncommented. When I run my program, I get this error: ----- Traceback (most recent call last): File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py", line 120, in <module> FindLinks(Website) File "C:/Documents and Settings/Andrew/Desktop/ScoutCode-0.09.py", line 84, in FindLinks robotparser.read() File "C:\Program Files\Python25\lib\robotparser.py", line 61, in read f = opener.open(self.url) File "C:\Program Files\Python25\lib\urllib.py", line 190, in open return getattr(self, name)(url) File "C:\Program Files\Python25\lib\urllib.py", line 451, in open_file return self.open_local_file(url) File "C:\Program Files\Python25\lib\urllib.py", line 465, in open_local_file raise IOError(e.errno, e.strerror, e.filename) IOError: [Errno 2] The system cannot find the path specified: 'en.wikipedia.org\\robots.txt' Note the last line 'en.wikipedia.org\\robots.txt'. I want 'en.wikipedia.org/robots.txt'! What am I doing wrong? If this has been answered before, please just give me a link to the proper thread. If you need more contextual code, I can post more. -- http://mail.python.org/mailman/listinfo/python-list