Hello All, How do I make a class for retrieving all forms and input fields on a web page? Meaning: form name, form URL, and all input fields (text, textarea, select, etc.). I have something currently and it kind of works. It also kind of works for retrieving all images on web pages. My main concern here is to make it retrieve forms and input fields into a dict that has this kind of outline: Maybe not exactly like this, but something of this nature. So I can use it in other classes.
forms= {"name":{"url":{"""input_name1":"","input_name_2","select_input":{"value1":"","value2":"selected","value3":""}}}} This is what I have made a long time ago, I haven't touched it in a while, but I need to make it work with some other classes. I need tit to receive all forms and inputs for me. What is a more efficient way to do this? My background is in PHP and I want to extend my python knowledge. I came up with this script by Googling something a long time ago and making modifications and additions to it. You can find the stdout_colours class on line, JFGI. It's allows for display of caller and called function and colors output to terminal. Thanks for your help my friends. #!/usr/bin/python import urllib, urllib2 import re, sys,string import os, sys, Image from sgmllib import SGMLParser from urlparse import urlparse import stdout_colours class URLPacker(SGMLParser): def __init__(self,colorize,caller): self.colorize=colorize self.caller=caller self.reset() def reset(self): SGMLParser.reset(self) self.func_me_color="white_on_black" self.soc=stdout_colours.stdout_colors(self.colorize,self.caller) self.soc.me_him(['ENTER:',__name__],self.func_me_color) self.urls = {} self.imgs = {} self.forms = {} self.inputs = {} self.action = "" self.method = "" self.url="" self.path="" self.source="" self.dirname="" self.level=0 self.max_depth=4 self.urlRExp = re.compile('[.]htm$|[.]html$|[.]shtml$|[/]$|[.]php$', re.IGNORECASE) self.fileRExp = re.compile('[\/:*?"<>|]') self.formats=[] self.soc.me_him(['EXIT:',__name__],self.func_me_color) def start_a(self,attrs): self.soc.me_him(['ENTER:',__name__],self.func_me_color) self.soc.w(attrs,"red") href = [v for k,v in attrs if k=='href'] for value in href: if self.urls.has_key(value): pass else: self.urls[value]=0 self.soc.me_him(['EXIT:',__name__],self.func_me_color) def start_img(self,attrs): self.soc.me_him(['ENTER:',__name__],self.func_me_color) self.soc.w(attrs,"blue") src = [v for k,v in attrs if k=="src"] for value in src: if 
self.imgs.has_key(value): pass else: self.imgs[value]=0 self.soc.me_him(['EXIT:',__name__],self.func_me_color) def start_form(self,attrs): self.soc.me_him(['ENTER:',__name__],self.func_me_color) self.soc.w(attrs,"green") method = [v for k, v in attrs if k=='method'] action = [v for k,v in attrs if k=="action"] if string.join(method,"") != "": self.method=method[0] else: self.method="post" self.action=action[0] self.soc.w(action,"white_on_green") self.soc.w(method,"white_on_blue") self.forms[self.action]={} self.forms[self.action][self.method]={} self.soc.me_him(['EXIT:',__name__],self.func_me_color) def start_input(self,attrs): self.soc.me_him(['ENTER:',__name__],self.func_me_color) self.soc.w(attrs,"yellow") name = [v for k, v in attrs if k=='name'] value = [v for k,v in attrs if k=="value"] if string.join(name,"") !="": if string.join(value,"")!="": self.forms[self.action][self.method][name[0]]=value[0] else: self.forms[self.action][self.method][name[0]]="" self.soc.w(self.forms,"white_on_gold") self.soc.me_him(['EXIT:',__name__],self.func_me_color) def url_dirname(self, url): self.soc.me_him(['RETURN:',__name__],self.func_me_color) #print url return self.fileRExp.sub('_',url) def dirname(self,url): self.soc.me_him(['RETURN:',__name__],self.func_me_color) return os.path.dirname(url) def save_images(self,minsize): if os.path.isdir(self.path+self.dirname): return False # aborting, dir exists else: a=self.path+self.dirname os.mkdir(a) os.chdir(a) print self.url for img in self.imgs.keys(): loc=self.url_dirname(img) print loc, img, a try: if self.imgs[img]==0: #urllib.urlretrieve(img,loc) #os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" '+img) #im = Image.open(loc) #if im.size[0]<minsize or im.size[1] < minsize: #print "removed",img,loc #os.remove(loc) #else: #print "image saved", img,loc self.imgs[img]=1 except IOError,e: 
self.soc.w(["save_images IOERROR",IOError,e,"img: ",img,"loc: ",loc,"path: ",a],"white_on_red") for img in self.urls.keys(): #if img.find(".jpg")!=-1: loc=self.url_dirname(img) print loc, img, a #try: # if self.urls[img]==0: # continue #os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" '+img) #urllib.urlretrieve(img,loc) #im = Image.open(loc) #if im.size[0]<minsize or im.size[1] < minsize: #print "removed",img,loc #os.remove(loc) #else: #print "image saved", img,loc #self.urls[img]=1 #except IOError,e: # self.soc.w(["save_images IOERROR",IOError,e,"img: ",img,"loc: ",loc,"path: ",a],"white_on_red") #os.system('wget -r -l1 -A jpg -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" -nd --wait=2 --random-wait --no-parent -nv ' + self.url) #if self.url.find("jpg")==-1: # pass #else: #urllib.urlretrieve(self.url,string.join(string.split(self.url,"/"),"")) os.chdir("..") #print "done saving to "+path+dirname #return True def convert_to_absolute_urls(self): urls = {} for url in self.urls: if os.path.dirname(self.url) =="http:": self.url=self.url+"/" else: self.url=os.path.dirname(self.url)+"/" print urlparse(url), self.url if urlparse(url)[0].find("http") == -1: if urlparse(url)[0].find("script") == -1: urls[self.url+url]=self.urls[url] else: urls[url]=self.urls[url] self.urls = urls def convert_to_absolute_imgs(self): print "here" imgs = {} for img in self.imgs: if os.path.dirname(self.url) =="http:": self.url=self.url+"/" else: self.url=os.path.dirname(self.url)+"/" print urlparse(img), self.url if urlparse(img)[0].find("http") == -1: if urlparse(img)[0].find("script") == -1: imgs[self.url+img]=self.imgs[img] else: imgs[img]=self.imgs[img] print "IMAGES: ",imgs self.imgs = imgs def process_url(self,url,path,level,max_depth): #self.reset() 
self.soc.me_him(['ENTER:',__name__],self.func_me_color) if path[-1:] !="/": path=path+"/" self.url = url self.path = path self.soc.w(["url: ",url],"blue") self.dirname = self.url_dirname(self.url) self.soc.w(["dirname: ",self.dirname],"red") if os.path.isdir(self.path): self.soc.w(["path: ",self.path,"LEVEL:",level],"green") if os.path.isdir(self.path+self.dirname): print "ABORT dir already exists: "+ self.path+self.dirname return False else: try: if self.url.find(".jpg")==-1: if self.urls[self.url]==0: sock = urllib.urlopen(self.url) self.source = sock.read() self.feed(self.source) self.urls[self.url]=1 sock.close() self.close() self.convert_to_absolute_urls() self.convert_to_absolute_imgs() print "urls: ",self.soc.w(self.urls,"white_on_red") print "forms: ",self.soc.w(self.forms,"white_on_blue") print "imgs: ",self.soc.w(self.imgs,"white_on_green") print "SAVING IMAGES",self.url #self.save_images(250) for i in self.urls.keys(): if self.urls.has_key(i): pass #self.process_url(i,self.path,level+1,max_depth) else: self.soc.w(["BEEN DONE",self.url],"white_on_gold") else: urllib.urlretrieve(self.url,self.path) except IOError,e: self.soc.w(["process_url IOERROR",IOError,e],"white_on_red") #return False else: self.soc.w(["Incorrect Path:", self.path],"white_on_red") self.soc.me_him(['EXIT:',__name__],self.func_me_color) if __name__ == "__main__": path=sys.argv[1] url=sys.argv[2] colorize=sys.argv[3] caller=sys.argv[4] func_me_color="white_on_black" soc=stdout_colours.stdout_colors(colorize,caller) soc.me(['ENTER:',__name__],func_me_color) max_depth=4 level=0 up=URLPacker(colorize,caller) up.urls[url]=0 up.process_url(url,path,level,max_depth) soc.me(['EXIT:',__name__],func_me_color) -- А-Б-В-Г-Д-Е-Ё-Ж-З-И-Й-К-Л-М-Н-О-П-Р-С-Т-У-Ф-Х-Ц-Ч-Ш-Щ-Ъ-Ы-Ь-Э-Ю-Я а-б-в-г-д-е-ё-ж-з-и-й-к-л-м-н-о-п-р-с-т-у-ф-х-ц-ч-ш-щ-ъ-ы-ь-э-ю-я
-- http://mail.python.org/mailman/listinfo/python-list