On Jun 12, 4:27 pm, Phillip B Oldham <[EMAIL PROTECTED]> wrote:
> I'm keen on learning python, with a heavy lean on doing things the
> "pythonic" way, so threw the following script together in a few hours
> as a first-attempt in programming python.
>
> I'd like the community's thoughts/comments on what I've done;
> improvements I can make, "don'ts" I should be avoiding, etc. I'm not
> so much bothered about the resulting data - for the moment it meets my
> needs. But any comment is welcome!
>
> #!/usr/bin/env python
> ## Open a file containing a list of domains (1 per line),
> ## request and parse it's whois record and push to a csv
> ## file.
>
> import subprocess
> import re
>
> src = open('./domains.txt')
>
> dest = open('./whois.csv', 'w');
>
> sep = "|"
> headers = ["Domain","Registrant","Registrant's
> Address","Registrar","Registrant Type","Date Registered","Renewal
> Date","Last Updated","Name Servers"]
>
> dest.write(sep.join(headers)+"\n")
>
> def trim( txt ):
>     x = []
>     for line in txt.split("\n"):
>         if line.strip() == "":
>             continue
>         if line.strip().startswith('WHOIS'):
>             continue
>         if line.strip().startswith('>>>'):
>             continue
>         if line.strip().startswith('%'):
>             continue
>         if line.startswith("--"):
>             return ''.join(x)
>         x.append(" "+line)
>     return "\n".join(x)
>
> def clean( txt ):
>     x = []
>     isok = re.compile("^\s?([^:]+): ").match
>     for line in txt.split("\n"):
>         match = isok(line)
>         if not match:
>             continue
>         x.append(line)
>     return "\n".join(x);
>
> def clean_co_uk( rec ):
>     rec = rec.replace('Company number:', 'Company number -')
>     rec = rec.replace("\n\n", "\n")
>     rec = rec.replace("\n", "")
>     rec = rec.replace(": ", ":\n")
>     rec = re.sub("([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
>     rec = rec.replace(":\n", ": ")
>     rec = re.sub("^[ ]+\n", "", rec)
>     return rec
>
> def clean_net( rec ):
>     rec = rec.replace("\n\n", "\n")
>     rec = rec.replace("\n", "")
>     rec = rec.replace(": ", ":\n")
>     rec = re.sub("([a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
>     rec = rec.replace(":\n", ": ")
>     return rec
>
> def clean_info( rec ):
>     x = []
>     for line in rec.split("\n"):
>         x.append(re.sub("^([^:]+):", "\g<0> ", line))
>     return "\n".join(x)
>
> def record(domain, record):
>     details = ['','','','','','','','','']
>     for k, v in record.items():
>         try:
>             details[0] = domain.lower()
>             result = {
>                 "registrant": lambda: 1,
>                 "registrant name": lambda: 1,
>                 "registrant type": lambda: 4,
>                 "registrant's address": lambda: 2,
>                 "registrant address1": lambda: 2,
>                 "registrar": lambda: 3,
>                 "sponsoring registrar": lambda: 3,
>                 "registered on": lambda: 5,
>                 "registered": lambda: 5,
>                 "domain registeration date": lambda: 5,
>                 "renewal date": lambda: 6,
>                 "last updated": lambda: 7,
>                 "domain last updated date": lambda: 7,
>                 "name servers": lambda: 8,
>                 "name server": lambda: 8,
>                 "nameservers": lambda: 8,
>                 "updated date": lambda: 7,
>                 "creation date": lambda: 5,
>                 "expiration date": lambda: 6,
>                 "domain expiration date": lambda: 6,
>                 "administrative contact": lambda: 2
>             }[k.lower()]()
>             if v != '':
>                 details[result] = v
>         except:
>             continue
>
>     dest.write(sep.join(details)+"\n")
>
> ## Loop through domains
> for domain in src:
>
>     domain = domain.strip()
>
>     if domain == '':
>         continue
>
>     rec = subprocess.Popen(["whois",domain],
> stdout=subprocess.PIPE).communicate()[0]
>
>     if rec.startswith("No whois server") == True:
>         continue
>
>     if rec.startswith("This TLD has no whois server") == True:
>         continue
>
>     rec = trim(rec)
>
>     if domain.endswith(".net"):
>         rec = clean_net(rec)
>
>     if domain.endswith(".com"):
>         rec = clean_net(rec)
>
>     if domain.endswith(".tv"):
>         rec = clean_net(rec)
>
>     if domain.endswith(".co.uk"):
>         rec = clean_co_uk(rec)
>
>     if domain.endswith(".info"):
>         rec = clean_info(rec)
>
>     rec = clean(rec)
>
>     details = {}
>
>     try:
>         for line in rec.split("\n"):
>             bits = line.split(': ')
>             a = bits.pop(0)
>             b = bits.pop(0)
>             details[a.strip()] = b.strip().replace("\t", ", ")
>     except:
>         continue
>
>     record(domain, details)
>
> ## Cleanup
> src.close()
> dest.close()

Just a few quick things before I leave work.

#!/usr/bin/env python
"""Open a file containing a list of domains (1 per line),
request and parse it's whois record and push to a csv
file.
"""
# Rather use docstrings than multiline commenting like that.

def trim(txt):
    x = []
    for line in txt.splitlines(): # Strings have a built in function
        if not line.strip() or line.startswith('WHOIS') \
                or line.startswith('>>>') or line.startswith('%'):
            continue # you can do them in one if statement
        if line.startswith('--'):
            return ''.join(x)
        x.append(' '+line)
    return '\n'.join(x)

for domain in src:
    if not domain.strip():
        continue # A line with nothing is False
    rec = subprocess.Popen(["whois",domain.strip()],
                           stdout=subprocess.PIPE).communicate()[0]
    if rec.startswith('No whois server') \
            or rec.startswith('This TLD has no whois server'):
        continue # Startswith will return True/False so it is enough
    rec = trim(rec)
    if domain.endswith('.net'):
        rec = clean_net(rec)
    elif domain.endswith('.com'): # Rather use if/elif statements unless somehow you think you will match more than one.
        ....

    for line in rec.splitlines():
        try:
            a, b = line.split(': ')[:2]
            details[a.strip()] = b.strip().replace('\t', ', ')
        except IndexError: # No matches
            continue

Hope that's a start.
--
http://mail.python.org/mailman/listinfo/python-list