Hey guys... I've got a weird, hopefully simple, issue.
the following sample bit of script is stripped down, and simply gets the "form" node from the specified site "schedule.psu.edu". the problem i run into is that the dom/xpath from the libxml2dom works, and i get the dom object everytime i run the app, but that the xpath is intermittent!!! in other words, i can run the script 10 times.. and it might work 7 or 8 times.. the other times, the xpath doesn't give the nodes back... when it works, name1_ in the app should be a list of nodes (for the 2 forms in the page). and len_ should be 2. is there anything you might suggest that i try in order to get a better handle on exactly what might be going on here... keep in mind, i'm not a python guy, just trying to get this to consistently work... my suspicion is that the culprit might be memory related... i'm running linux, on a x86 dual core with 4G ram. the python is 2.5.1. thoughts/comments/etc would be appreciated... -thanks!!! #!/usr/bin/python # # test.py # # scrapes/extracts the basic data for the college # # # the app gets/stores # name # url # address (street/city/state # phone # ###################################################################### #test python script import re import libxml2dom import urllib import urllib2 import sys, string from mechanize import Browser import mechanize #import tidy import os.path import cookielib from libxml2dom import Node from libxml2dom import NodeList import subprocess import time ######################## # # Parse pricegrabber.com ######################## ##cj = "p" ##COOKIEFILE = 'cookies.lwp' #cookielib = 1 urlopen = urllib2.urlopen #cj = urllib2.cookielib.LWPCookieJar() ##cj = cookielib.LWPCookieJar() Request = urllib2.Request br = Browser() br2 = Browser() ##if cj != None: ## print "sss" ###install the CookieJar for the default CookieProcessor ## if os.path.isfile(COOKIEFILE): ## cj.load(COOKIEFILE) ## print "foo\n" ## if cookielib: ## opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) ## 
urllib2.install_opener(opener) ## print "foo2\n" user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' values1 = {'name' : 'Michael Foord', 'location' : 'Northampton', 'language' : 'Python' } headers = { 'User-Agent' : user_agent } url="http://schedule.psu.edu/" #======================================= if __name__ == "__main__": # main app txdata = None #---------------------------- ##br.set_cookiejar(cj) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.addheaders = [('User-Agent', 'Firefox')] print "url =",url br.open(url) ##cj.save(COOKIEFILE) # resave cookies res = br.response() # this is a copy of response s = res.read() print "slen=",len(s) # s contains HTML not XML text d = libxml2dom.parseString(s, html=1) print "d",d name_=[] len_=0 name_ = d.xpath("//form") #name_ = d.xpath("/html/body/form") print "name1",name_ len_ = len(name_) print "len",len(name_) #print "sdlfs" sys.exit() # else: # print "err in form_ID" print "here..." -- http://mail.python.org/mailman/listinfo/python-list