STINNER Victor <victor.stin...@haypocalc.com> added the comment:
On Saturday 06 November 2010 17:00:15 you wrote:
> Note that it is useful for opening any text file with an encoding cookie,
> not only python source code, so "tokenize.open()" sounds attractive.
Ok, the new patch (tokenize_open-2.patch) uses the tokenize.open() name and adds a
test for a BOM without a coding cookie (testing the utf-8-sig encoding).
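For anyone who wants to try the new helper, here is a minimal usage sketch (the
file name is just a placeholder):

    import tokenize

    # Open a Python source file using the encoding declared by its coding
    # cookie or BOM; detect_encoding() falls back to utf-8 when neither is
    # present.
    with tokenize.open('example.py') as fp:
        source = fp.read()
        print(fp.encoding)  # e.g. 'iso-8859-15' if the cookie declares it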
----------
Added file: http://bugs.python.org/file19529/tokenize_open-2.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue10335>
_______________________________________
Index: Doc/library/tokenize.rst
===================================================================
--- Doc/library/tokenize.rst (revision 86167)
+++ Doc/library/tokenize.rst (working copy)
@@ -101,16 +101,18 @@
If no encoding is specified, then the default of ``'utf-8'`` will be
returned.
- :func:`detect_encoding` is useful for robustly reading Python source files.
- A common pattern for this follows::
+ Use :func:`open` to open a Python script: it uses :func:`detect_encoding` to
+ detect the file encoding.
- def read_python_source(file_name):
- with open(file_name, "rb") as fp:
- encoding = tokenize.detect_encoding(fp.readline)[0]
- with open(file_name, "r", encoding=encoding) as fp:
- return fp.read()
+.. function:: open(filename)
+
+ Open a Python script in read mode using the encoding detected by
+ :func:`detect_encoding`.
+
+ .. versionadded:: 3.2
+
+
Example of a script rewriter that transforms float literals into Decimal
objects::
@@ -153,4 +155,3 @@
result.append((toknum, tokval))
return untokenize(result).decode('utf-8')
-
Index: Lib/py_compile.py
===================================================================
--- Lib/py_compile.py (revision 86167)
+++ Lib/py_compile.py (working copy)
@@ -104,9 +104,7 @@
byte-compile all installed files (or all files in selected
directories).
"""
- with open(file, "rb") as f:
- encoding = tokenize.detect_encoding(f.readline)[0]
- with open(file, encoding=encoding) as f:
+ with tokenize.open(file) as f:
try:
timestamp = int(os.fstat(f.fileno()).st_mtime)
except AttributeError:
Index: Lib/tabnanny.py
===================================================================
--- Lib/tabnanny.py (revision 86167)
+++ Lib/tabnanny.py (working copy)
@@ -93,11 +93,8 @@
check(fullname)
return
- with open(file, 'rb') as f:
- encoding, lines = tokenize.detect_encoding(f.readline)
-
try:
- f = open(file, encoding=encoding)
+ f = tokenize.open(file)
except IOError as msg:
errprint("%r: I/O Error: %s" % (file, msg))
return
Index: Lib/tokenize.py
===================================================================
--- Lib/tokenize.py (revision 86167)
+++ Lib/tokenize.py (working copy)
@@ -29,6 +29,7 @@
from token import *
from codecs import lookup, BOM_UTF8
import collections
+from io import TextIOWrapper
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
import token
@@ -335,6 +336,20 @@
return default, [first, second]
+_builtin_open = open
+
+def open(filename):
+ """
+ Open a Python script in read mode with the right encoding.
+ """
+ buffer = _builtin_open(filename, 'rb')
+ encoding, line = detect_encoding(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ return text
+
+
def tokenize(readline):
"""
The tokenize() generator requires one argument, readline, which
Index: Lib/trace.py
===================================================================
--- Lib/trace.py (revision 86167)
+++ Lib/trace.py (working copy)
@@ -419,10 +419,9 @@
def find_executable_linenos(filename):
"""Return dict where keys are line numbers in the line number table."""
try:
- with io.FileIO(filename, 'r') as file:
- encoding, lines = tokenize.detect_encoding(file.readline)
- with open(filename, "r", encoding=encoding) as f:
+ with tokenize.open(filename) as f:
prog = f.read()
+ encoding = f.encoding
except IOError as err:
print(("Not printing coverage data for %r: %s"
% (filename, err)), file=sys.stderr)
Index: Lib/linecache.py
===================================================================
--- Lib/linecache.py (revision 86167)
+++ Lib/linecache.py (working copy)
@@ -123,9 +123,7 @@
else:
return []
try:
- with open(fullname, 'rb') as fp:
- coding, line = tokenize.detect_encoding(fp.readline)
- with open(fullname, 'r', encoding=coding) as fp:
+ with tokenize.open(fullname) as fp:
lines = fp.readlines()
except IOError:
return []
Index: Lib/test/test_tokenize.py
===================================================================
--- Lib/test/test_tokenize.py (revision 86167)
+++ Lib/test/test_tokenize.py (working copy)
@@ -564,7 +564,8 @@
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
- STRING, ENDMARKER, tok_name, detect_encoding)
+ STRING, ENDMARKER, tok_name, detect_encoding,
+ open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
@@ -857,6 +858,27 @@
readline = self.get_readline((b'# coding: bad\n',))
self.assertRaises(SyntaxError, detect_encoding, readline)
+ def test_open(self):
+ filename = support.TESTFN + '.py'
+ self.addCleanup(support.unlink, filename)
+
+ # test coding cookie
+ for encoding in ('iso-8859-15', 'utf-8'):
+ with open(filename, 'w', encoding=encoding) as fp:
+ print("# coding: %s" % encoding, file=fp)
+ print("print('euro:\u20ac')", file=fp)
+ with tokenize_open(filename) as fp:
+ self.assertEqual(fp.encoding, encoding)
+ self.assertEqual(fp.mode, 'r')
+
+ # test BOM (no coding cookie)
+ encoding = 'utf-8-sig'
+ with open(filename, 'w', encoding=encoding) as fp:
+ print("print('euro:\u20ac')", file=fp)
+ with tokenize_open(filename) as fp:
+ self.assertEqual(fp.encoding, encoding)
+ self.assertEqual(fp.mode, 'r')
+
class TestTokenize(TestCase):
def test_tokenize(self):