Try this:

### get_charset.py ###
import re
import urllib2
# A charset label in a Content-Type value, e.g. "text/html; charset=UTF-8".
# Only label characters are captured so trailing text is never swallowed.
_HEADER_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([\w.:-]+)', re.IGNORECASE)

# A charset declared by a <meta> tag. Covers both the legacy
# <meta http-equiv="Content-Type" content="text/html; charset=..."> form and
# the HTML5 <meta charset="..."> form. It is a bytes pattern because the page
# body is searched before its encoding is known.
_META_CHARSET_RE = re.compile(
    br'<meta\b[^>]*?charset\s*=\s*["\']?([\w.:-]+)', re.IGNORECASE)


def _charset_from_header(content_type):
    """Return the charset label found in a Content-Type value, or ''."""
    match = _HEADER_CHARSET_RE.search(content_type or '')
    return match.group(1) if match else ''


def _charset_from_html(html):
    """Return the charset of the first <meta> declaration in *html*, or ''."""
    match = _META_CHARSET_RE.search(html)
    if match is None:
        return ''
    label = match.group(1)
    # The match is bytes on Python 3 and already a native str on Python 2;
    # the pattern only admits ASCII label characters, so decoding is safe.
    return label if isinstance(label, str) else label.decode('ascii')


def get_charset(url):
    """Return the character encoding declared by the page at *url*.

    Two sources are consulted: a ``<meta>`` charset declaration in the
    document body and the ``charset`` parameter of the ``Content-Type``
    response header. The declaration in the HTML wins when both are
    present; the header is only the fallback.

    Args:
        url: Any URL accepted by ``urlopen``.

    Returns:
        The charset label exactly as declared (e.g. ``'utf-8'``), or ``''``
        when neither the document nor the header declares one.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or invalid URL.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2, as used by the original post

    # closing() guarantees the connection is released even if read() fails.
    with closing(urlopen(url)) as resp:
        content_type = resp.headers.get('Content-Type', '') or ''
        html = resp.read()

    charset_from_html = _charset_from_html(html)
    if charset_from_html:
        return charset_from_html
    return _charset_from_header(content_type)


# Quoted message that this reply answers, kept verbatim on one line:
# > Date: Sun, 9 Jun 2013 04:47:02 -0700 > Subject: Re: how to detect the character encoding in a web page ? > From: redstone-c...@163.com > To: python-list@python.org > > 在 2012年12月24日星期一UTC+8上午8时34分47秒,iMath写道: > > how to detect the character encoding in a web page ? > > > > such as this page > > > > > > > > http://python.org/ > > Finally ,I found by using PyQt’s QtextStream , QTextCodec and chardet ,we can > get a web page code more securely > even for this bad page > http://www.qnwz.cn/html/yinlegushihui/magazine/2013/0524/425731.html > > this script > http://www.flvxz.com/getFlv.php?url=aHR0cDojI3d3dy41Ni5jb20vdTk1L3ZfT1RFM05UYzBNakEuaHRtbA== > > and this page without chardet in its source code > http://msdn.microsoft.com/en-us/library/bb802962(v=office.12).aspx > > > from PyQt4.QtCore import * > from PyQt4.QtGui import * > from PyQt4.QtNetwork import * > import sys > import chardet > > def slotSourceDownloaded(reply): > redirctLocation=reply.header(QNetworkRequest.LocationHeader) > redirctLocationUrl=reply.url() if not redirctLocation else redirctLocation > #print(redirctLocationUrl,reply.header(QNetworkRequest.ContentTypeHeader)) > > if (reply.error()!= QNetworkReply.NoError): > print('11111111', reply.errorString()) > return > > pageCode=reply.readAll() > charCodecInfo=chardet.detect(pageCode.data()) > > textStream=QTextStream(pageCode) > > 
codec=QTextCodec.codecForHtml(pageCode,QTextCodec.codecForName(charCodecInfo['encoding'] > )) > textStream.setCodec(codec) > content=textStream.readAll() > print(content) > > if content=='': > print('---------', 'cannot find any resource !') > return > > reply.deleteLater() > qApp.quit() > > > if __name__ == '__main__': > app =QCoreApplication(sys.argv) > manager=QNetworkAccessManager () > url =input('input url :') > request=QNetworkRequest > (QUrl.fromEncoded(QUrl.fromUserInput(url).toEncoded())) > request.setRawHeader("User-Agent" ,'Mozilla/5.0 (Windows NT 5.1) > AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17 SE > 2.X MetaSr 1.0') > manager.get(request) > manager.finished.connect(slotSourceDownloaded) > sys.exit(app.exec_()) > -- > http://mail.python.org/mailman/listinfo/python-list
-- http://mail.python.org/mailman/listinfo/python-list