I am trying to build a Python script that reads a Sitemap file and pushes the URLs to a Google Search Appliance. I can fetch the XML document and parse it with regular expressions, but I want to move to native XML tools instead. The problem is this: if I use urllib.urlopen(url) I can convert the I/O stream to an XML document, but if I use urllib2.urlopen and then read the response, I get the content; yet when I pass it to minidom.parse() I get an "IOError: [Errno 2] No such file or directory" error.
# THIS WORKS, but will have issues if the I/O stream is a compressed file
def GetPageGuts(net, url):
    pageguts = urllib.urlopen(url)
    xmldoc = minidom.parse(pageguts)
    return xmldoc

# THIS DOESN'T WORK, but I don't understand why
def GetPageGuts(net, url):
    request = getRequest_obj(net, url)
    response = urllib2.urlopen(request)
    response.headers.items()
    pageguts = response.read()
    # Test to see if the response is a gzip/compressed data stream
    if isCompressedFile(response, url):
        compressedstream = StringIO.StringIO(pageguts)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        pageguts = gzipper.read()
    xmldoc = minidom.parse(pageguts)
    response.close()
    return xmldoc

# I am getting the following error
Starting SiteMap Manager ...
Traceback (most recent call last):
  File "./tester.py", line 267, in ?
    main()
  File "./tester.py", line 49, in main
    fetchSiteMap(ResourceDict, line)
  File "./tester.py", line 65, in fetchSiteMap
    pageguts = GetPageGuts(ResourceDict['NET'], url)
  File "./tester.py", line 89, in GetPageGuts
    xmldoc = minidom.parse(pageguts)
  File "/usr/lib/python2.4/xml/dom/minidom.py", line 1915, in parse
    return expatbuilder.parse(file)
  File "/usr/lib/python2.4/xml/dom/expatbuilder.py", line 922, in parse
    fp = open(file, 'rb')
IOError: [Errno 2] No such file or directory: '<?xml version="1.0" encoding="UTF-8"?>\n<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/binder_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/dir_sitemap.xml</loc>\n<lastmod>2010-05-05</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/mags_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/news_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/sent_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/srep_sitemap.xml</loc>\n<lastmod>2001-05-04</lastmod>\n</sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/yb_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n</sitemapindex>\n'

# A couple of supporting things
def getRequest_obj(net, url):
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'ICES Sitemap Bot dni-ices-searchad...@ugov.gov')
    request.add_header('Accept-encoding', 'gzip')
    return request

def isCompressedFile(r, u):
    answer = False
    if r.headers.has_key('Content-encoding'):
        answer = True
    else:
        # Check to see if the URL ends in .gz
        if u.endswith(".gz"):
            answer = True
    return answer
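Looking at the traceback, my guess is that minidom.parse() only accepts a filename or a file-like object, so when I hand it the string returned by response.read(), expatbuilder calls open() on the whole document text as if it were a path. A minimal sketch of what I think I should be doing instead (assuming pageguts already holds the decompressed XML text):

import StringIO
from xml.dom import minidom

# Option 1: parse the string directly; minidom.parseString() takes the
# document text itself rather than a filename or file object
xmldoc = minidom.parseString(pageguts)

# Option 2: wrap the string in a file-like object so minidom.parse()
# receives something with a read() method instead of a "filename"
xmldoc = minidom.parse(StringIO.StringIO(pageguts))

That would also explain why the urllib version works: there I pass the response object itself (which has a read() method) to minidom.parse(), not the string I read out of it.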