I'm keen on learning Python, with a heavy lean on doing things the "pythonic" way, so I threw the following script together in a few hours as a first attempt at programming in Python.
# I'd like the community's thoughts/comments on what I've done; improvements I
# can make, "don'ts" I should be avoiding, etc. I'm not so much bothered about
# the resulting data - for the moment it meets my needs. But any comment is
# welcome!

#!/usr/bin/env python
"""Export whois details for a list of domains to a pipe-separated file.

Open a file containing a list of domains (1 per line), request and parse
each domain's whois record and push the result to a csv file.
"""

import re
import subprocess

SOURCE_PATH = './domains.txt'
DEST_PATH = './whois.csv'
SEP = "|"
HEADERS = ["Domain", "Registrant", "Registrant's Address", "Registrar",
           "Registrant Type", "Date Registered", "Renewal Date",
           "Last Updated", "Name Servers"]

# Lower-cased whois field name -> index of the HEADERS column it fills.
# The keys are matched against registry output, so their spelling is kept
# exactly as found (including "registeration").
FIELD_COLUMNS = {
    "registrant": 1,
    "registrant name": 1,
    "registrant type": 4,
    "registrant's address": 2,
    "registrant address1": 2,
    "registrar": 3,
    "sponsoring registrar": 3,
    "registered on": 5,
    "registered": 5,
    "domain registeration date": 5,
    "renewal date": 6,
    "last updated": 7,
    "domain last updated date": 7,
    "name servers": 8,
    "name server": 8,
    "nameservers": 8,
    "updated date": 7,
    "creation date": 5,
    "expiration date": 6,
    "domain expiration date": 6,
    "administrative contact": 2,
}

# whois output starting with one of these means there is nothing to parse.
NO_SERVER_PREFIXES = ("No whois server", "This TLD has no whois server")

# Lines whose stripped text starts with one of these are dropped by trim().
NOISE_PREFIXES = ('WHOIS', '>>>', '%')

_FIELD_LINE = re.compile(r"^\s?([^:]+): ")
_INFO_KEY = re.compile(r"^([^:]+):")
_NET_KEY = re.compile(r"([a-zA-Z']+\s?[a-zA-Z]*:\n)")
_CO_UK_KEY = re.compile(r"([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)")


def trim(txt):
    """Drop blank and noise lines from raw whois output.

    Every kept line is prefixed with one space. Processing stops at the
    first line that starts with "--"; that line and everything after it
    are discarded. The kept lines are always joined with newlines.
    """
    kept = []
    for line in txt.split("\n"):
        stripped = line.strip()
        if not stripped or stripped.startswith(NOISE_PREFIXES):
            continue
        if line.startswith("--"):
            break
        kept.append(" " + line)
    return "\n".join(kept)


def clean(txt):
    """Keep only the lines of *txt* that look like ``name: value``."""
    return "\n".join(
        line for line in txt.split("\n") if _FIELD_LINE.match(line))


def _one_field_per_line(rec, key_pattern):
    """Collapse *rec* to one line, then break it before each matched key.

    Every ": " is temporarily turned into ":\\n" so that *key_pattern* can
    recognise a key by the newline that follows its colon.
    """
    rec = rec.replace("\n", "")
    rec = rec.replace(": ", ":\n")
    rec = key_pattern.sub(r"\n\g<0>", rec)
    return rec.replace(":\n", ": ")


def clean_co_uk(rec):
    """Reflow a trimmed .co.uk record so each field sits on its own line."""
    # Rewritten so the colon is not mistaken for a field separator.
    rec = rec.replace('Company number:', 'Company number -')
    rec = _one_field_per_line(rec, _CO_UK_KEY)
    # Remove a leading line that consists only of spaces.
    return re.sub(r"^[ ]+\n", "", rec)


def clean_net(rec):
    """Reflow a trimmed .net/.com/.tv record to one field per line."""
    return _one_field_per_line(rec, _NET_KEY)


def clean_info(rec):
    """Insert a space after the first colon of every line of *rec*."""
    return "\n".join(
        _INFO_KEY.sub(r"\g<0> ", line) for line in rec.split("\n"))


# Domain suffixes and the cleaner applied to records of matching domains.
TLD_CLEANERS = (
    ((".net", ".com", ".tv"), clean_net),
    ((".co.uk",), clean_co_uk),
    ((".info",), clean_info),
)


def parse_fields(rec):
    """Turn cleaned ``name: value`` lines into a dict.

    The value is the text between the first and the second ": " on the
    line; tabs inside it become ", ". Lines without ": " are ignored. A
    name that occurs more than once keeps its last value.
    """
    fields = {}
    for line in rec.split("\n"):
        parts = line.split(': ')
        if len(parts) < 2:
            continue
        fields[parts[0].strip()] = parts[1].strip().replace("\t", ", ")
    return fields


def parse_whois(domain, raw):
    """Return the fields found in the raw whois output *raw* of *domain*.

    Returns an empty dict when the output reports a missing whois server
    or contains no ``name: value`` lines.
    """
    if raw.startswith(NO_SERVER_PREFIXES):
        return {}
    rec = trim(raw)
    lowered = domain.lower()
    for suffixes, cleaner in TLD_CLEANERS:
        if lowered.endswith(suffixes):
            rec = cleaner(rec)
            break
    return parse_fields(clean(rec))


def build_row(domain, fields):
    """Return the list of column values for *domain*, ordered like HEADERS.

    Field names are matched case-insensitively against FIELD_COLUMNS;
    unknown names and empty values are skipped.
    """
    row = [''] * len(HEADERS)
    row[0] = domain.lower()
    for name, value in fields.items():
        column = FIELD_COLUMNS.get(name.lower())
        if column is not None and value != '':
            row[column] = value
    return row


def record(domain, fields, out):
    """Write one SEP-joined line for *domain* to the open file *out*."""
    out.write(SEP.join(build_row(domain, fields)) + "\n")


def whois(domain):
    """Run the ``whois`` command for *domain* and return its stdout."""
    # Text mode, so the result is str rather than bytes on Python 3.
    proc = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE,
                            universal_newlines=True)
    return proc.communicate()[0]


def main():
    """Read SOURCE_PATH and write one line per parsed domain to DEST_PATH."""
    with open(SOURCE_PATH) as src, open(DEST_PATH, 'w') as dest:
        dest.write(SEP.join(HEADERS) + "\n")
        for line in src:
            domain = line.strip()
            if not domain:
                continue
            fields = parse_whois(domain, whois(domain))
            # Domains that yield no fields get no output line.
            if fields:
                record(domain, fields, dest)


if __name__ == "__main__":
    main()

# --
# http://mail.python.org/mailman/listinfo/python-list