Chris
import urllib2
# Fetch a page and convert it to pure-ASCII bytes in which every non-ASCII
# character is written as an XML numeric character reference (&#NNNN;).
#
# The raw bytes are decoded with the first candidate encoding that works,
# tried in this order:
#   1. the charset announced in the HTTP Content-Type response header
#   2. the encoding taken from the XML declaration
#   3. the encoding taken from the HTML <meta> tag
#   4. UTF-8
# If none of those succeeds, the bytes are treated as a single-byte Western
# encoding: latin-1 when no byte falls in 128..159, otherwise cp1252 with
# latin-1 as the last resort.

# urllib2.urlopen() rejects a URL without a scheme ("unknown url type").
url = 'http://www.someurl.com'

f = urllib2.urlopen(url)
try:
    data = f.read()
    # The server-declared encoding is the "charset" parameter of the
    # Content-Type response header; it is empty/None when the server did not
    # send one, in which case this candidate is skipped below.
    pageencoding = f.info().getparam('charset')
finally:
    # Close the connection even if reading the body failed.
    f.close()

# Placeholders: substitute the values parsed from the document itself.
# As written they are not valid codec names, so they are skipped below.
xmlencoding = 'whatever i parsed out of the file'
htmlmetaencoding = 'whatever i parsed out of the metatag'

for encoding in (pageencoding, xmlencoding, htmlmetaencoding, 'UTF-8'):
    if not encoding:
        continue
    try:
        data = data.decode(encoding)
    except (LookupError, UnicodeError):
        # LookupError: unknown codec name; UnicodeError: bytes do not fit.
        continue
    break
else:
    # No candidate worked, so guess between the two common single-byte
    # Western encodings. Bytes 128..159 are control codes in latin-1 but
    # printable characters in cp1252, so their presence suggests cp1252.
    flag = not any(127 < byte < 160 for byte in bytearray(data))
    if flag:
        data = data.decode('latin-1')
    else:
        try:
            data = data.decode('cp1252')
        except UnicodeDecodeError:
            # cp1252 leaves a few byte values undefined; latin-1 maps every
            # byte, so this final step cannot fail.
            data = data.decode('latin-1')

# data is text at this point; emit ASCII with character references.
data = data.encode("ascii", "xmlcharrefreplace")
-- http://mail.python.org/mailman/listinfo/python-list