在 2012年12月24日星期一UTC+8上午8时34分47秒,iMath写道:
> how to detect the character encoding in a web page ?
>
> such as this page
>
>
>
> http://python.org/
# First, install chardet (for example: pip install chardet).
import chardet

# Fetch the raw page bytes. `line` holds the URL; it and `urllib2` are
# expected to be provided by the surrounding code (Python 2).
html_1 = urllib2.urlopen(line, timeout=120).read()

# chardet.detect() returns a dict; its 'encoding' entry is the guessed
# codec name, or None when no guess could be made.
mychar = chardet.detect(html_1)
bianma = mychar['encoding']

if bianma and bianma.lower() == 'utf-8':
    # Already UTF-8: keep the bytes untouched.
    html = html_1
else:
    # Re-encode to UTF-8 using the codec chardet actually detected, instead
    # of assuming every non-UTF-8 page is gb2312. gb2312 remains the
    # fallback when detection fails or names a codec Python does not know.
    try:
        html = html_1.decode(bianma or 'gb2312', 'ignore').encode('utf-8')
    except LookupError:
        html = html_1.decode('gb2312', 'ignore').encode('utf-8')

# --
# http://mail.python.org/mailman/listinfo/python-list