Hi, I have written this script to run as a cron job that loops through a text file containing a list of URLs. It works fine for most of the links; however, a number of the URLs are subdomains (they are government sites), such as http://basename.airforce.mil, and these always throw 400 errors even though the sites exist.
Is there a way to get around this? Here is the script:

import httplib
from urlparse import urlparse
from httplib import HTTP

class LinkChecker:

    def oldStuff(self, url):
        # leftover from an earlier version; not called by check()
        p = urlparse(url)
        h = HTTP(p[1])
        h.putrequest('HEAD', p[2])
        h.endheaders()
        if h.getreply()[0] == 200:
            return 1
        else:
            return 0

    def check(self):
        print "\nLooping through the file, line by line."

        # define default values for the parameters
        text_file = open("/home/jjaffe/pythonModules/JAMRSscripts/urls.txt", "r")
        output = ""
        errors = "=================== ERRORS (website exists but 404, 503, etc.): ===================\n"
        failures = "\n=================== FAILURES (cannot connect to website at all): ===================\n"
        eCount = 0
        fCount = 0

        # loop through each line and see what the response code is
        for line in text_file:
            p = urlparse(line)
            try:
                conn = httplib.HTTPConnection(p[1])
                conn.request("GET", p[2])
                r1 = conn.getresponse()
                if r1.status != 200:
                    # the response code was not success (200), so report the error
                    errors += "\n " + str(r1.status) + " error for: " + p[1] + p[2]
                    eCount += 1
                data1 = r1.read()
                conn.close()
            except:
                # the connection attempt failed (DNS error, refused, timeout, etc.),
                # so the site is likely unreachable
                failures += "\n Could not create connection object: " + p[1] + p[2]
                fCount += 1
        text_file.close()

        # see if there were errors and build the output string
        if (eCount == 0) and (fCount == 0):
            output = "No errors or failures to report"
        else:
            output = errors + "\n\n" + failures
        print output

if __name__ == '__main__':
    lc = LinkChecker()
    lc.check()
    del lc

Thanks in advance.
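P.S. One guess I haven't verified: each line read from the file keeps its trailing newline, and for bare-host URLs like the airforce.mil one, urlparse returns an empty path, so conn.request("GET", "") puts a blank path on the request line; either could plausibly make a server answer 400. Here is a minimal sketch of the probe with both guarded (probe is a hypothetical helper of mine, just for illustration, not part of the script above):

import httplib
from urlparse import urlparse

def probe(url):
    # hypothetical helper; url is one raw line from urls.txt
    p = urlparse(url.strip())            # drop the trailing "\n" that file iteration leaves on each line
    conn = httplib.HTTPConnection(p[1])  # p[1] is the host (netloc)
    conn.request("GET", p[2] or "/")     # an empty path would send "GET  HTTP/1.1"; default it to "/"
    r = conn.getresponse()
    conn.close()
    return r.status

If those sites actually need something else (redirect handling, a particular Host header, etc.), this won't help, but stripping the line and defaulting the path is cheap to try.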