On Wed, Apr 12, 2017 at 02:28:48PM +0200, Jürgen Spitzmüller wrote: > > There seem to be some ways to detect the file encoding in python: > http://stackoverflow.com/questions/436220/determine-the-encoding-of-text-in-python
I find that the chardet library is the least reliable. I get better results with python-magic. However, all methods are unreliable to some extent. Try the attached patch to see in how many encodings you can read a file. Try it also on binary files ;) -- Enrico
#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function import os,sys,io from getopt import getopt def error(message): sys.stderr.write(message + '\n') sys.exit(1) def usage(prog_name): return "Usage: %s [-v] <files>" % os.path.basename(prog_name) encodings = ["ascii", "utf-8", "utf-16", "utf-32", "iso-8859-1", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-9", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "big5", "big5hkscs", "cp037", "cp424", "cp437", "cp500", "cp720", "cp737", "cp775", "cp850", "cp852", "cp855", "cp856", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869", "cp874", "cp875", "cp932", "cp949", "cp950", "cp1006", "cp1026", "cp1140", "cp1250", "cp1251", "cp1252", "cp1253", "cp1254", "cp1255", "cp1256", "cp1257", "cp1258", "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr", "gb2312", "gbk", "gb18030", "hz", "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr", "johab", "koi8_r", "koi8_u", "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman", "mac_turkish", "ptcp154", "shift_jis", "shift_jis_2004", "shift_jisx0213", "utf_32_be", "utf_32_le", "utf_16_be", "utf_16_le", "utf_7", "utf_8_sig"] verbose = False try: (options, args) = getopt(sys.argv[1:], "hv") except: error(usage(sys.argv[0])) if len(args) == 0: error(usage(sys.argv[0])) for (opt, param) in options: if opt == "-h": print(usage(sys.argv[0])) if opt == "-v": verbose = True for file in args: if not os.path.isdir(file): first = True length = 0 for e in encodings: try: fh = io.open(file, 'r', encoding=e) fh.readlines() fh.close() except: if verbose: print('%s: got unicode error with %s' % (file, e)) else: if verbose: print('%s: success with encoding: %s' % (file, e)) else: if first: print('%s: %s' % (file, e), end='') length += len(file)+len(e)+2 first = False else: if 
length+len(e)+2 > 79: print(',\n\t%s' % e, end='') length = len(e)+8 else: print(', %s' % e, end='') length += len(e)+2 sys.stdout.flush() if not first: print('') else: print('%s: directory' % file)