I reran the programs, taking a bit more care with the encoding of the
file. This had no effect on the speeds. Only a small number of paths
don't fit into ASCII:
ASCII 1076101
Latin1 218
BMP 113
Astral 0
# encoding:utf-8
import codecs, os, time
from os.path import join, getsize
def classify_path(path):
    """Return the bucket index for *path* based on its highest code point.

    Buckets: 0 = ASCII (< 128), 1 = Latin1 (< 256), 2 = BMP (< 65536),
    3 = Astral (>= 65536). An empty string has no characters and is
    reported as bucket 0 instead of raising ``ValueError`` from ``max``.
    """
    max_char = max(map(ord, path)) if path else 0
    if max_char >= 65536:
        return 3
    if max_char >= 256:
        return 2
    if max_char >= 128:
        return 1
    return 0


def count_buckets(paths):
    """Return ``[ascii, latin1, bmp, astral]`` counts for *paths*.

    Empty entries are skipped: splitting the file content on "\\n" yields
    an empty string for a trailing newline or a blank line, and those are
    not paths.
    """
    bucket = [0, 0, 0, 0]
    for p in paths:
        if not p:
            continue
        bucket[classify_path(p)] += 1
    return bucket


def main(list_path="filelist.txt"):
    """Read the UTF-8 encoded path list and print the per-bucket counts."""
    with codecs.open(list_path, "r", "utf-8") as f:
        paths = f.read().split("\n")
    bucket = count_buckets(paths)
    print("ASCII", bucket[0])
    print("Latin1", bucket[1])
    print("BMP", bucket[2])
    print("Astral", bucket[3])


if __name__ == "__main__":
    main()
Neil
--
http://mail.python.org/mailman/listinfo/python-list