Hello all,
I have a question. I believe this script worked before Python 2.6; I don't remember the last time I used it, but it was a while ago, and now it's failing. Would anyone mind looking at it and telling me what's going wrong? Also, is there a quick way to match on a certain site, e.g. take only the links from google.com and output just those?
#!/usr/bin/env python

#This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

#This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License along with this program. If not, see
#http://www.gnu.org/licenses/.

"""
This script parses all the links out of an HTML document and writes them to a text file.
"""
import sys,optparse
import htmllib,formatter

#program class declarations:
class Links(htmllib.HTMLParser):
    def __init__(self,formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links=[]
    def start_a(self, attrs):
        if (len(attrs)>0):
            for a in attrs:
                if a[0]=="href":
                    self.links.append(a[1])
                    print a[1]
                    break

def main(argv):
    """Extract every link from an HTML file and save them one per line.

    argv is the complete command line: argv[0] is the program name,
    argv[1] the HTML file to read and argv[2] the text file to write.

    Returns 1 (after printing a usage message) when the number of
    arguments is wrong, and 0 once the links have been written.
    """
    if len(argv) != 3:
        print("Error:\n"+argv[0]+" <input> <output>.\nParses <input> for all links and saves them to <output>.")
        return 1

    parser = Links(formatter.NullFormatter())

    print("Retrieving data:")
    # The with-blocks guarantee both files are closed even if reading,
    # parsing or writing raises.
    with open(argv[1], "r") as page:
        print("Feeding data to parser:")
        parser.feed(page.read())
    # Flush whatever the parser is still buffering so that markup at the
    # very end of the document is processed too.
    parser.close()

    print("Writing links:")
    with open(argv[2], "w") as output:
        output.writelines(link + "\n" for link in parser.links)

    print("Wrote "+str(len(parser.links))+" links to "+argv[2]+".")
    print("done.")
    return 0

if __name__ == "__main__":
    # Script entry point: hand the full argument list to main() and use
    # whatever it returns as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)

--

Thanks,
Ty

--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to