I am trying to build a Python script that reads a Sitemap file and
pushes the URLs to a Google Search Appliance.  I am able to fetch the
XML document and parse it with regular expressions, but I want to move
to using native XML tools for this.  The problem I am getting is: if
I use urllib.urlopen(url) I can convert the IO stream to an XML
document, but if I use urllib2.urlopen and then read the response, I
get the content, yet when I pass it to minidom.parse() I get an
"IOError: [Errno 2] No such file or directory:" error.

THIS WORKS but will have issues if the IO Stream is a compressed file
def GetPageGuts(net, url):
        """Fetch *url* and return its content parsed as a minidom Document.

        Works because minidom.parse() accepts a file-like object, and the
        stream returned by urllib.urlopen() is file-like — no intermediate
        read() into a string is needed.
        """
        stream = urllib.urlopen(url)
        return minidom.parse(stream)

# THIS DOESN'T WORK, but I don't understand why
def GetPageGuts(net, url):
        """Fetch *url* via urllib2 and return the parsed minidom Document.

        The request (built by getRequest_obj) advertises
        'Accept-encoding: gzip', so the body is transparently
        decompressed when the server responds with gzip content.
        """
        request = getRequest_obj(net, url)
        response = urllib2.urlopen(request)
        pageguts = response.read()
        response.close()
        # Test to see if the response is a gzip/compressed data stream
        # (isCompressedFile only inspects headers and the URL, both of
        # which remain available after close()).
        if isCompressedFile(response, url):
                compressedstream = StringIO.StringIO(pageguts)
                gzipper = gzip.GzipFile(fileobj=compressedstream)
                pageguts = gzipper.read()
        # BUG FIX: minidom.parse() expects a *filename* or a file-like
        # object.  pageguts is the document text, so parse() tried to
        # open it as a path — producing
        #   IOError: [Errno 2] No such file or directory: '<?xml ...'
        # parseString() is the API for in-memory XML strings.
        xmldoc = minidom.parseString(pageguts)
        return xmldoc

# I am getting the following error
Starting SiteMap Manager ...
Traceback (most recent call last):
  File "./tester.py", line 267, in ?
    main()
  File "./tester.py", line 49, in main
    fetchSiteMap(ResourceDict, line)
  File "./tester.py", line 65, in fetchSiteMap
    pageguts = GetPageGuts(ResourceDict['NET'], url)
  File "./tester.py", line 89, in GetPageGuts
    xmldoc = minidom.parse(pageguts)
  File "/usr/lib/python2.4/xml/dom/minidom.py", line 1915, in parse
    return expatbuilder.parse(file)
  File "/usr/lib/python2.4/xml/dom/expatbuilder.py", line 922, in
parse
    fp = open(file, 'rb')
IOError: [Errno 2] No such file or directory: '<?xml version="1.0"
encoding="UTF-8"?>\n<sitemapindex xmlns="http://www.sitemaps.org/
schemas/sitemap/0.9">\n<sitemap>\n<loc>http://www.myorg.org/janes/
sitemaps/binder_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</
sitemap>\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/
dir_sitemap.xml</loc>\n<lastmod>2010-05-05</lastmod>\n</sitemap>
\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/
mags_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>
\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/
news_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>
\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/
sent_sitemap.xml</loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>
\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/
srep_sitemap.xml</loc>\n<lastmod>2001-05-04</lastmod>\n</sitemap>
\n<sitemap>\n<loc>http://www.myorg.org/janes/sitemaps/yb_sitemap.xml</
loc>\n<lastmod>2010-09-09</lastmod>\n</sitemap>\n</sitemapindex>\n'

# A couple of supporting things
def getRequest_obj(net, url):
        """Build a urllib2.Request for *url* identifying the sitemap bot
        and advertising gzip support (*net* is currently unused here)."""
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'ICES Sitemap Bot dni-ices-searchad...@ugov.gov')
        req.add_header('Accept-encoding', 'gzip')
        return req

def isCompressedFile(r, u):
        """Return True if response *r* or URL *u* indicates compressed data.

        r -- a response object whose .headers is a dict-like mapping
        u -- the URL string that was fetched

        A response is considered compressed when the server sent a
        'Content-encoding' header, or when the URL ends in '.gz'.
        """
        # dict.has_key() is deprecated (removed in Python 3); the `in`
        # operator is the equivalent membership test.
        if 'Content-encoding' in r.headers:
                return True
        # Fall back to checking whether the URL names a .gz file.
        return u.endswith(".gz")

-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to