On Wed, Apr 12, 2017 at 02:28:48PM +0200, Jürgen Spitzmüller wrote:
> 
> There seem to be some ways to detect the file encoding in python:
> http://stackoverflow.com/questions/436220/determine-the-encoding-of-text-in-python

I find that the chardet library is the least reliable. I get better
results with python-magic. However, all methods are unreliable to
some extent. Try the attached patch to see in how many encodings
you can read a file. Try it also on binary files ;)

-- 
Enrico
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import io
import os
import sys
from getopt import getopt, GetoptError

def error(message):
    """Write *message* (plus a newline) to stderr and exit with status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)

def usage(prog_name):
    """Build the one-line usage string from the program's basename."""
    return "Usage: {} [-v] <files>".format(os.path.basename(prog_name))

# Candidate text codecs to probe, drawn from the codec names supported by
# the Python standard library: the common ones first (ASCII, UTF variants,
# ISO-8859-*), then regional code pages, CJK codecs, and legacy Mac/DOS
# encodings.  Each is tried in turn against every input file below.
encodings = ["ascii", "utf-8", "utf-16", "utf-32", "iso-8859-1", "iso-8859-2",
 "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7",
 "iso-8859-8", "iso-8859-9", "iso-8859-10", "iso-8859-13", "iso-8859-14",
 "iso-8859-15", "iso-8859-16", "big5", "big5hkscs", "cp037", "cp424", "cp437",
 "cp500", "cp720", "cp737", "cp775", "cp850", "cp852", "cp855", "cp856",
 "cp857", "cp858", "cp860", "cp861", "cp862", "cp863", "cp864", "cp865",
 "cp866", "cp869", "cp874", "cp875", "cp932", "cp949", "cp950", "cp1006",
 "cp1026", "cp1140", "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
 "cp1255", "cp1256", "cp1257", "cp1258", "euc_jp", "euc_jis_2004",
 "euc_jisx0213", "euc_kr", "gb2312", "gbk", "gb18030", "hz", "iso2022_jp",
 "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_3",
 "iso2022_jp_ext", "iso2022_kr", "johab", "koi8_r", "koi8_u", "mac_cyrillic",
 "mac_greek", "mac_iceland", "mac_latin2", "mac_roman", "mac_turkish",
 "ptcp154", "shift_jis", "shift_jis_2004", "shift_jisx0213", "utf_32_be",
 "utf_32_le", "utf_16_be", "utf_16_le", "utf_7", "utf_8_sig"]

verbose = False

# Parse the command line.  Catch only GetoptError (unknown option) instead
# of a bare "except:", which also swallowed SystemExit/KeyboardInterrupt.
try:
    (options, args) = getopt(sys.argv[1:], "hv")
except GetoptError:
    error(usage(sys.argv[0]))

for (opt, param) in options:
    if opt == "-h":
        # Print usage and exit successfully; previously -h fell through
        # into the file-probing loop (and "prog -h" alone exited 1).
        print(usage(sys.argv[0]))
        sys.exit(0)
    if opt == "-v":
        verbose = True

# Options are handled first so that -h works without file arguments.
if len(args) == 0:
    error(usage(sys.argv[0]))

# Probe every candidate encoding against each file argument.  An encoding
# "succeeds" when the whole file decodes without error.  In verbose mode
# every attempt is reported; otherwise successes are printed as one
# comma-separated list per file, wrapped at column 79.
for path in args:          # renamed from "file", which shadows a builtin
    if os.path.isdir(path):
        print('%s: directory' % path)
        continue
    first = True
    length = 0             # width of the current output line (non-verbose)
    for enc in encodings:
        try:
            # "with" guarantees the handle is closed even when decoding
            # fails partway through (the original leaked fh on error).
            with io.open(path, 'r', encoding=enc) as fh:
                fh.readlines()
        except Exception:
            # Decode failures vary by codec (UnicodeDecodeError,
            # ValueError, ...); OSError (unreadable file) is treated the
            # same.  Exception, not a bare "except:", so Ctrl-C still works.
            if verbose:
                print('%s: got unicode error with %s' % (path, enc))
        else:
            if verbose:
                print('%s: success with encoding: %s' % (path, enc))
            else:
                if first:
                    print('%s: %s' % (path, enc), end='')
                    length += len(path) + len(enc) + 2
                    first = False
                elif length + len(enc) + 2 > 79:
                    # Wrap: continuation lines start with a tab (width 8).
                    print(',\n\t%s' % enc, end='')
                    length = len(enc) + 8
                else:
                    print(', %s' % enc, end='')
                    length += len(enc) + 2
                sys.stdout.flush()
    if not first:
        print('')

Reply via email to