Try parsing from a string instead of a file name: use minidom.parseString(string), or wrap the string in a file-like object and use minidom.parse(StringIO.StringIO(string)).documentElement.
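In other words, the fix is just to hand minidom the string itself rather than letting parse() treat it as a path. A rough, untested sketch of GetPageGuts along those lines, assuming the getRequest_obj and isCompressedFile helpers from the quoted post are available:

import gzip
import StringIO
import urllib2
from xml.dom import minidom

def GetPageGuts(net, url):
    # Fetch the sitemap using the gzip-aware request from the original post
    request = getRequest_obj(net, url)
    response = urllib2.urlopen(request)
    pageguts = response.read()

    # Decompress if the server sent gzip (or the URL ends in .gz)
    if isCompressedFile(response, url):
        compressedstream = StringIO.StringIO(pageguts)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        pageguts = gzipper.read()
    response.close()

    # pageguts is a string, not a file name, so parse it as a string;
    # minidom.parse(StringIO.StringIO(pageguts)) would work as well
    return minidom.parseString(pageguts)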
On Fri, Sep 10, 2010 at 9:50 PM, jakecjacobson <jakecjacob...@gmail.com> wrote:

> I am trying to build a Python script that reads a Sitemap file and
> push the URLs to a Google Search Appliance. I am able to fetch the
> XML document and parse it with regular expressions but I want to move
> to using native XML tools to do this. The problem I am getting is if
> I use urllib.urlopen(url) I can convert the IO Stream to a XML
> document but if I use urllib2.urlopen and then read the response, I
> get the content but when I use minidom.parse() I get a "IOError:
> [Errno 2] No such file or directory:" error
>
> # THIS WORKS but will have issues if the IO Stream is a compressed file
> def GetPageGuts(net, url):
>     pageguts = urllib.urlopen(url)
>     xmldoc = minidom.parse(pageguts)
>     return xmldoc
>
> # THIS DOESN'T WORK, but I don't understand why
> def GetPageGuts(net, url):
>     request=getRequest_obj(net, url)
>     response = urllib2.urlopen(request)
>     response.headers.items()
>     pageguts = response.read()
>     # Test to see if the response is a gzip/compressed data stream
>     if isCompressedFile(response, url):
>         compressedstream = StringIO.StringIO(pageguts)
>         gzipper = gzip.GzipFile(fileobj = compressedstream)
>         pageguts = gzipper.read()
>     xmldoc = minidom.parse(pageguts)
>     response.close()
>     return xmldoc
>
> # I am getting the following error
> Starting SiteMap Manager ...
> Traceback (most recent call last):
>   File "./tester.py", line 267, in ?
>     main()
>   File "./tester.py", line 49, in main
>     fetchSiteMap(ResourceDict, line)
>   File "./tester.py", line 65, in fetchSiteMap
>     pageguts = GetPageGuts(ResourceDict['NET'], url)
>   File "./tester.py", line 89, in GetPageGuts
>     xmldoc = minidom.parse(pageguts)
>   File "/usr/lib/python2.4/xml/dom/minidom.py", line 1915, in parse
>     return expatbuilder.parse(file)
>   File "/usr/lib/python2.4/xml/dom/expatbuilder.py", line 922, in parse
>     fp = open(file, 'rb')
> IOError: [Errno 2] No such file or directory: '<?xml version="1.0"
> encoding="UTF-8"?>\n<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/binder_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/dir_sitemap.xml</loc>\n<lastmod>2010-05-05</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/mags_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/news_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/sent_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/srep_sitemap.xml</loc>\n<lastmod>2001-05-04</lastmod>\n</sitemap>\n
> <sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/yb_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n
> </sitemapindex>\n'
>
> # A couple of supporting things
> def getRequest_obj(net, url):
>     request = urllib2.Request(url)
>     request.add_header('User-Agent', 'ICES Sitemap Bot dni-ices-searchad...@ugov.gov')
>     request.add_header('Accept-encoding', 'gzip')
>     return request
>
> def isCompressedFile(r, u):
>     answer=False
>     if r.headers.has_key('Content-encoding'):
>         answer=True
>     else:
>         # Check to see if the URL ends in .gz
>         if u.endswith(".gz"):
>             answer=True
>     return answer

--
Nitin Pawar