Lex Hider wrote: > Hi, > Apologies if this is against etiquette. I've just got my first python app up > and running. It is a podcast aggregator depending on feedparser. I've really > only learnt enough to get this up and running. > > Any tips on the code quality and use of python would be appreciated. I've got > a feeling the overall structure is up the creek. > approx 220 LOC. > file: GodCast.py > > #!/usr/bin/python (snip) > # TODO: not found log http://docs.python.org/lib/module-logging.html
> # TODO: > # config file http://docs.python.org/lib/module-ConfigParser.html > # opml feed list? > # pygtk/pyqt/qtkde gui? > > # possible flags: test, print but don't actual do anything http://docs.python.org/lib/module-optparse.html > import re, feedparser, os, sys, shutil, time, getopt > import urllib2 > import urllib > import md5 > > boz = "" > HOME = os.path.expanduser("~") > # user configurable > #maxChecksPerDay = 8 > #maxChecksPerDay = 12 > maxChecksPerDay = 24 > myTemp = '/tmp' http://docs.python.org/lib/module-tempfile.html BTW, note that the most common naming convention in Python is all_lower_with_underscore (except for ClasseName). > #podDir = os.path.join(HOME, 'Audio/Podcasts') > podDir = os.path.join(HOME, 'Podcasts') > # end user configurable > downDir = os.path.join(myTemp, 'Podcasts') > dotDir = os.path.join(HOME, '.aGodCast') > logFile = os.path.join(dotDir, 'log') #list of downloaded urls > cacheDir = os.path.join(dotDir, 'cache') > ignoreNotFound = False # if true, add files not found to log > # list of feeds, ignore lines not beginning ^http > feedList = os.path.join(dotDir, 'feeds.txt') > > > def exitFunc(): > #f.close() > #log.close() > if boz: > print boz > > > def makeDirs(*dirs): > for dir in dirs: > if not os.path.exists(dir): > os.makedirs(dir) > > > # render is used because feeds use a lot of html, not just plain text. > def render(html): > if html: > html = re.sub('"', '\\"', html.encode('utf8')) > #command = 'echo "' + html + '" | w3m -dump -T text/html' > #command = 'echo "' + html + '" | html2text' > command = 'echo "' + html + '" | lynx -dump -stdin -force_html' another way : command = 'echo "%s" | lynx -dump -stdin -force_html' % html > os.system(command) > > > def localMD5(url): > hash = md5.new(url).hexdigest() + '.xml' #unique name from url > return os.path.join(cacheDir, hash) > > > def cache(url): > max = 60 * 60 * 24 / maxChecksPerDay #seconds > myfile = localMD5(url) > if os.path.isfile(myfile): > elapsed = int(time.time()) - os.path.getmtime(myfile) > if elapsed <= max: > return > print "FETCHING:", url + ' ...' Note that stdout is usually meant for normal program outputs (so one can pipe programs). Error reporting and verbosities should go to stderr > urllib.urlretrieve(url, myfile) > # handle half finish? > > > def updateCache(feeds): > l = [] > print "updating local xml cache..." > for feed in file(feeds, "r").read().split('\n'): > if not re.match('^http://', feed): # feedList ignores anything but > urls > continue > # TODO: handle whitespace, strip trailing > cache(feed) > l.append([localMD5(feed), feed]) > print "cache up to date" > return l > > > def geturl(url): > try: > redir = urllib2.urlopen(url).geturl() > except urllib2.HTTPError, e: > if e.code != 404: > print url > print "geturl HTTPError:", e.code > return e.code > except urllib2.URLError, e: > # (110, 'Connection timed out') > print e.reason > #print "geturl URLError:", e.code > else: > return redir > return 0 I'm afraid you didn't get the point of exception handling - it's meant to free function results from error code. Your above code totally defeats this - as is obvious when reading the calling code. > redirect = geturl(url) # TRAFFIC > if type(redirect) != int: #success do_something_useful_here() > elif redirect == 404: > print 'NOT FOUND:', url > if ignoreNotFound: > print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n' > log(url) > else: > sys.exit(2) may I suggest a rewrite : try: redirect = urllib2.urlopen(url).geturl() except urllib2.HTTPError, e: if e.code == 404: print 'NOT FOUND:', url if ignoreNotFound: print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n' log(url) else: print "geturl HTTPError %s on url %s" % (e.code, url) raise except urllib2.URLError, e: print "geturl URLError %s on url %s" % (e.reason, url) else: do_something_useful_here() > def htmlTitle(mainTitle, subTitle): > s = '<HR>' > s += '<H2>' + mainTitle + '</H2>' > s += '<H3>' + subTitle + '</H3>' > return s html_title_template = """ <hr> <h2>%(title)s</h2> <h3>%(subtitle)s</h3> """ def html_title(title, subtitle): return html_title_template % dict(title=title, subtitle=subtitle) > > def downloadPod(url, dest): > kb = 2 > success = 0 > command = 'wget --continue -O "' + dest + '" "' + url + '"' command = 'wget --continue -0 "%s" "%s"' % (dest, url) > status = os.system(command) > if status == success: > return True > else: > print "\nWGET:", status > if status == kb: > pass > #raise KeyboardInterrupt > return False > > > def downloadQueue(q, latest): > for x in range(latest): > for [feedTitle, castList] in q: for feedTitle, castList in q: > if not len(castList) > x: double negations are harder to grasp: if len(castList) <= x > continue I'm not sure to get what you're doing here.... > cast = castList[x] > if cast is None: > continue > url = cast.enclosures[0]['href'] > redirect = geturl(url) # TRAFFIC > if type(redirect) != int: #success > render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title)) > render(cast.description) Mixing logic and UI is usually a very bad idea IMHO. > podFile = os.path.basename(redirect).split('?')[0] > permDir = os.path.join(podDir, feedTitle) > permFile = os.path.join(permDir, podFile) > tempDir = os.path.join(downDir, feedTitle) > tempFile = os.path.join(tempDir, podFile) > if not os.path.isfile(permFile): > makeDirs(tempDir, permDir) > if downloadPod(redirect, tempFile): # TRAFFIC > shutil.move(tempFile, permFile) > log(url) > else: > print "EXITING" > sys.exit(2) This will make this function hard to use in a different context (like ie a mod_apache handler, a GUI, etc... You'd better raise some specific exception and let the caller handle it. (snip) > > def log(url): > file(logFile, 'a').write(url + "\n") > > def main(args): > sys.exitfunc = exitFunc > makeDirs(dotDir, podDir, downDir, cacheDir) > #make file if doesn't exist, may be better solution? yes : using the logging module > X = file(logFile, 'a') > latest = 13 #get the first x casts for each feed > try: > opts, args = getopt.getopt(sys.argv[1:], "l:", > ["latest=", "notfound"]) > except getopt.GetoptError: > sys.exit(2) > #usage() > > for opt, arg in opts: > if opt in ("-l", "--latest"): > latest = int(arg) > elif opt in ("--notfound"): > ignoreNotFound = True #add notfound files to log OMG. optparse is far better than this. (snip) My overall feeling is that in most function, you're mixing too much different concerns. Each function should do *one* thing (and do it well). The most obvious problem here is about mixing application logic and UI. Application logic (what your app is effectively doing) should be as independant as possible of UI concerns. If possible, you should be able to reuse most of the application logic (except for the main() func, which in your case is the command-line UI) as a module in other applications. Now there are time when the application code needs to notify the UI. The good news is that Python makes it easy to makes this generic, using callback functions or objects provided by the UI and called when appropriate by the application logic. The key here is to come up with a clear, well-defined interface between logic code and UI code. A last point : avoid globals whenever possible. Either pass values as arguments to functions or use a kind of config object if there are too much configuration stuff. My 2 cents -- bruno desthuilliers python -c "print '@'.join(['.'.join([w[::-1] for w in p.split('.')]) for p in '[EMAIL PROTECTED]'.split('@')])" -- http://mail.python.org/mailman/listinfo/python-list