Mark Russell added the comment:
As Guido requested, I've split off the generic reversed() and __reversed__()
doc additions into a separate patch against 2.6: http://bugs.python.org/issue1582
The I/O error from reversed(open("/etc/passwd")) was caused by the inner
TextIOWrapper calling close() (via the inherited IOBase.__del__() method).
I've fixed it by having TextIOReverseIterator keep a reference to the file
object, and added a test case for the bug.
I think it's at least questionable that TextIOWrapper.close() is calling
buffer.close() on a buffer that it did not create. I assumed that keeping
a reference to the buffer object would be enough to keep the buffer open,
and I suspect this is likely to trip up others in future. I think
TextIOWrapper.close() should probably just set a flag (for the use of its
own closed() method) and rely on reference counting to call close()
on the buffer object. If that sounds like it is along the right lines, I'm
happy to think about it a bit more and submit a patch.
Added file: http://bugs.python.org/file8913/reverse-file-iterator-20071210.diff
_____________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1677872>
_____________________________________
Index: Doc/library/stdtypes.rst
===================================================================
--- Doc/library/stdtypes.rst (revision 59453)
+++ Doc/library/stdtypes.rst (working copy)
@@ -1937,7 +1937,16 @@
right. However, using :meth:`seek` to reposition the file to an absolute
position will flush the read-ahead buffer.
+.. method:: file.__reversed__()
+   Return a new iterator that yields the lines of the file in reverse order
+   without reading the entire file into memory first.  Normally called via the
+   :func:`reversed` builtin, as in ``for line in reversed(f): print(line)``.
+   This is useful for scanning backwards through large files.  Note that
+   iterating changes the current position of the underlying file object, so
+   you should not interleave reverse and forward iteration over the same
+   file object.
+
.. method:: file.read([size])
Read at most *size* bytes from the file (less if the read hits EOF before
Index: Lib/io.py
===================================================================
--- Lib/io.py (revision 59453)
+++ Lib/io.py (working copy)
@@ -1136,6 +1136,126 @@
)[self.seennl]
+class TextIOReverseIterator:
+    """Line-based reverse iterator wrapper for IOBase objects.
+
+    This class is used to implement TextIOWrapper.__reversed__().
+    It searches backwards for the encoded line terminator, which
+    works for UTF-8 but not for encodings where the encoding of one
+    character can be a substring of the encoding of another, longer one.
+
+    Each call to next() returns one decoded line, starting with the last
+    line of the file.  The underlying buffer is read backwards in chunks
+    of at most buffer_size bytes, so its position changes as the
+    iteration proceeds.
+    """
+
+    # XXX Should we check for encodings that are known to work?  Currently
+    # we would return incorrect results for a codec where, say, the encoding
+    # of newline could appear as a substring of the encoding for some other
+    # character or where the codec can have a non-default state at the start
+    # of a line (do such encodings exist?).
+
+    def __init__(self, buffer, encoding, newline=None,
+                 buffer_size=DEFAULT_BUFFER_SIZE, wrapped_file=None):
+        """Seek buffer to its end and prepare to iterate backwards.
+
+        buffer is the binary stream to read; it must support seek(),
+        tell() and read().  encoding is the codec name (a str) used both
+        to encode the line terminators and to decode the returned lines.
+
+        newline selects the line terminator handling:
+          None  - CR, LF and CR LF all end a line and are returned as LF;
+          ''    - CR, LF and CR LF all end a line and are returned as found;
+          other - only that exact string ends a line.
+
+        buffer_size is the chunk size used when reading backwards.
+        wrapped_file is an object to keep referenced for the lifetime of
+        the iterator (keeps the owner of buffer from being finalized).
+
+        Raises ValueError if encoding is not a str, or if the encodings of
+        CR and LF do not concatenate to the encoding of CR LF.
+        """
+        if not isinstance(encoding, str):
+            raise ValueError("invalid encoding: %r" % encoding)
+        buffer.seek(0, 2)
+        self.buffer = buffer
+        self._wrapped_file = wrapped_file  # Keep ref to avoid premature close
+        self._bufsize = buffer_size
+        self._encoding = encoding
+        self._translate_newlines = newline is None
+        if newline:
+            self._enc_cr = self._enc_lf = None
+        else:
+            self._enc_cr = '\r'.encode(encoding)
+            self._enc_lf = '\n'.encode(encoding)
+            if self._enc_cr + self._enc_lf != '\r\n'.encode(encoding):
+                raise ValueError('unsupported encoding: %r' % encoding)
+        self._newline = newline.encode(encoding) if newline else None
+        self._limpos = buffer.tell()
+        self._bufpos = self._limpos
+        self._pending = b''
+        # Terminator located by the most recent search.  Because we scan
+        # backwards, it is the terminator that *follows* the next line to be
+        # returned.
+        self._following_ending = None
+
+    def _extend_buffer_backwards(self):
+        """Read the chunk preceding the buffered data into self._pending.
+
+        The start of the read is aligned down to a multiple of the buffer
+        size.  Bytes of self._pending at or beyond self._limpos are dropped.
+        """
+        (bufpos, limpos, bufsize) = (self._bufpos, self._limpos, self._bufsize)
+
+        newpos = (bufpos // bufsize) * bufsize
+        if newpos == bufpos:
+            newpos -= bufsize
+        assert newpos >= 0
+        nbytes = bufpos - newpos
+        assert nbytes != 0
+
+        self.buffer.seek(newpos, 0)
+        # Bind the position to a name so the failure message can report it
+        # (the message previously referenced an undefined name).
+        seekpos = self.buffer.tell()
+        assert seekpos == newpos, \
+               'seek() arrived at %r (expected %r)' % (seekpos, newpos)
+        newbuf = self.buffer.read(nbytes)
+        assert len(newbuf) == nbytes, 'Unexpected EOF'
+
+        if limpos > bufpos:
+            newbuf += self._pending[:limpos - bufpos]
+        (self._pending, self._bufpos) = (newbuf, newpos)
+
+    def __iter__(self):
+        """Return self; the object is its own iterator."""
+        return self
+
+    def _find_universal_endline(self, limpos):
+        """Look backwards for the last CR, LF or CR LF before limpos.
+
+        limpos is an offset into self._pending.  Return (offset, terminator),
+        or (-1, None) if we need to read more.
+        """
+        enc_cr, enc_lf = self._enc_cr, self._enc_lf
+        cr_pos = self._pending.rfind(enc_cr, 0, limpos)
+        lf_pos = self._pending.rfind(enc_lf, 0, limpos)
+        res = -1, None
+        if lf_pos != -1 and lf_pos > cr_pos:
+            # An LF at the very start of the buffered data may presumably be
+            # the second half of a CR LF pair whose CR has not been read yet,
+            # so report "not found" unless the buffer starts at offset 0.
+            if lf_pos > len(enc_cr) or self._bufpos == 0:
+                # The CR is adjacent when it ends exactly where the LF starts.
+                if cr_pos != -1 and cr_pos == lf_pos - len(enc_cr):
+                    res = cr_pos, enc_cr + enc_lf
+                else:
+                    res = lf_pos, enc_lf
+        elif cr_pos != -1:
+            res = cr_pos, enc_cr
+        return res
+
+    def _getbytes(self):
+        """Return the next line, scanning backwards, as encoded bytes.
+
+        Raises StopIteration once the line at the start of the file has
+        been returned.
+        """
+        is_firstline = self._pending == b''
+        limpos, newline = self._limpos, self._newline
+        # Terminator that follows the line about to be returned; it was
+        # located by the previous search.
+        following = self._following_ending
+
+        if limpos is None:
+            raise StopIteration
+        assert limpos >= 0
+
+        # limpos points one character past the end of the line we're about to
+        # return - e.g "abc\ndef"
+        #                  ^
+        while True:
+            lim_offset = limpos - self._bufpos   # file offset -> buf offset
+            if newline is None:
+                offset, ending = self._find_universal_endline(lim_offset)
+            else:
+                offset = self._pending.rfind(newline, 0, lim_offset)
+                ending = newline
+
+            if offset != -1:
+                self._limpos = self._bufpos + offset
+                self._following_ending = ending
+                line_offset = offset + len(ending)
+                break
+
+            if self._bufpos > 0:
+                self._extend_buffer_backwards()
+            else:
+                # Reached the start of the file: this is the final line.
+                self._limpos = None
+                line_offset = 0
+                break
+
+        # We treat the first returned line specially, as it may be missing
+        # the endline terminator.  Also we avoid returning an initial empty
+        # line for files with a normal terminating endline.
+        #
+        if is_firstline:
+            return self._pending[line_offset:] or self._getbytes()
+
+        # Re-attach the terminator that followed this line in the file (not
+        # the one just found, which precedes it).  The two differ only when
+        # terminators are mixed and not translated.
+        if self._translate_newlines:
+            ending_to_add = self._enc_lf
+        else:
+            ending_to_add = following
+        return self._pending[line_offset:lim_offset] + ending_to_add
+
+    def __next__(self):
+        """Return the next line (moving towards the start) as a str."""
+        return self._getbytes().decode(self._encoding)
+
+
class TextIOWrapper(TextIOBase):
"""Buffered text stream.
@@ -1382,6 +1502,10 @@
self._pending = res[n:]
return res[:n]
+    def __reversed__(self):
+        """Return an iterator over the lines of this stream, last line first.
+
+        The iterator keeps a reference to this wrapper so that the
+        underlying buffer stays usable while the iteration is in progress.
+        """
+        return TextIOReverseIterator(
+            self.buffer, self._encoding, newline=self._readnl,
+            wrapped_file=self)
+
def __next__(self):
self._telling = False
line = self.readline()
Index: Lib/test/test_io.py
===================================================================
--- Lib/test/test_io.py (revision 59453)
+++ Lib/test/test_io.py (working copy)
@@ -621,6 +621,88 @@
self.assertEquals(got_line, exp_line)
self.assertEquals(len(got_lines), len(exp_lines))
+    def testReversedLines(self):
+        """reversed() over a text stream must mirror forward iteration."""
+        # Note: the first two literals are adjacent without a comma, so they
+        # are concatenated into a single sample.
+        texts = [
+            "a\nbb\nccc\n\neee\n"
+            "AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG",
+            "",
+            "foo",
+            "\nfoo",
+            "\rbar",
+            "\r\nbaz",
+            "foo\n",
+            "\n\n",
+            ("\0\x0f\xff\u0fff\uffff\U000fffff\U0010ffff"*3 + "\n") * 3 + "\n"
+        ]
+
+        # Check the line splitting logic
+        for text in texts:
+            for encoding in ("utf-8", "latin-1"):
+                for newline in (None, "\n", "\r", "\r\n"):
+                    for bufsize in (None, 1, 2, 3, 5, 10):
+                        try:
+                            data = text.encode(encoding)
+                        except UnicodeEncodeError:
+                            # Skip non-ascii tests for latin-1
+                            continue
+
+                        def wrap():
+                            return io.TextIOWrapper(io.BytesIO(data),
+                                                    encoding=encoding,
+                                                    newline=newline)
+
+                        textio = wrap()
+                        if bufsize is None:
+                            backwards = reversed(textio)
+                        else:
+                            backwards = io.TextIOReverseIterator(
+                                textio.buffer, encoding, newline, bufsize)
+                        # The parameters ride along in the compared tuples
+                        # so that a failure report identifies the case.
+                        params = dict(text=text, enc=encoding,
+                                      nl=newline, bufsize=bufsize)
+                        got = list(backwards)
+                        exp = list(reversed(list(wrap())))
+                        self.assertEquals((got, params), (exp, params))
+
+        # Check the normal i/o path
+        path = test_support.TESTFN
+        f = io.open(path, "w+")
+        f.write(texts[0])
+        f.seek(0, 0)
+        forward_lines = f.readlines()
+        f.close()
+        # The file object below is referenced only by the reverse iterator:
+        # this is the regression check for the premature close() problem.
+        revlines = list(reversed(io.open(path)))
+        self.assertEquals(revlines, list(reversed(forward_lines)))
+
+
+    def testReversedLinesOpcount(self):
+        """Reverse iteration must not read more often than buffer-sized
+        chunks of the data require."""
+        import math
+
+        class LoggingRaw (io.RawIOBase):
+            """Raw stream over bytes that counts readinto() and seek() calls."""
+
+            def __init__(self, data):
+                self._bytes = io.BytesIO(data)
+                self._nseeks = 0
+                self._nreads = 0
+
+            def readinto(self, b):
+                count = self._bytes.readinto(b)
+                self._nreads += 1
+                return count
+
+            def seek(self, pos, whence):
+                newpos = self._bytes.seek(pos, whence)
+                self._nseeks += 1
+                return newpos
+
+            def readable(self):
+                return True
+
+        encoding = "ascii"
+        lines = [ "x" * 80 + "\n" ] * 1000 + [ "l" * 1000 ]
+        payload = "".join(lines).encode(encoding)
+        raw = LoggingRaw(payload)
+        textio = io.TextIOWrapper(io.BufferedReader(raw), encoding)
+        self.assertEqual(list(reversed(textio)), list(reversed(lines)))
+        # One read per buffer-sized chunk of the data, rounded up.
+        exp_nreads = math.ceil(len(payload) / io.DEFAULT_BUFFER_SIZE)
+        self.assertEqual(raw._nreads, exp_nreads)
+
+
def testNewlinesInput(self):
testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
@@ -792,7 +874,11 @@
while f.readline():
f.tell()
t4 = timer()
+ for line in reversed(f):
+ pass
+ t5 = timer()
f.close()
+
if test_support.verbose:
print("\nTiming test: %d lines of %d characters (%d bytes)" %
(nlines, nchars, nbytes))
@@ -801,6 +887,7 @@
print("Reading using iteration: %6.3f seconds" % (t2-t1))
print("Reading using readline(): %6.3f seconds" % (t3-t2))
print("Using readline()+tell(): %6.3f seconds" % (t4-t3))
+ print("Using reversed(): %6.3f seconds" % (t5-t4))
def testReadOneByOne(self):
txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
Index: Lib/test/output/test_profile
===================================================================
--- Lib/test/output/test_profile (revision 59453)
+++ Lib/test/output/test_profile (working copy)
@@ -10,7 +10,7 @@
12 0.000 0.000 0.012 0.001 :0(hasattr)
1 0.000 0.000 0.000 0.000 :0(setprofile)
1 0.000 0.000 1.000 1.000 <string>:1(<module>)
- 2 0.000 0.000 0.000 0.000 io.py:1211(flush)
+ 2 0.000 0.000 0.000 0.000 io.py:1331(flush)
1 0.000 0.000 0.000 0.000 io.py:269(flush)
1 0.000 0.000 0.000 0.000 io.py:656(closed)
1 0.000 0.000 0.000 0.000 io.py:874(flush)
@@ -33,11 +33,11 @@
:0(append) ->
:0(exc_info) ->
:0(exec) -> <string>:1(<module>)(1) 1.000
- io.py:1211(flush)(2) 0.000
+ io.py:1331(flush)(2) 0.000
:0(hasattr) -> test_profile.py:115(__getattr__)(12)
0.028
:0(setprofile) ->
<string>:1(<module>) -> test_profile.py:30(testfunc)(1)
1.000
-io.py:1211(flush) -> io.py:269(flush)(1) 0.000
+io.py:1331(flush) -> io.py:269(flush)(1) 0.000
io.py:874(flush)(1) 0.000
io.py:269(flush) ->
io.py:656(closed) ->
@@ -74,10 +74,10 @@
test_profile.py:93(helper2)(8)
0.400
:0(setprofile) <- profile:0(testfunc())(1) 1.000
<string>:1(<module>) <- :0(exec)(1) 1.000
-io.py:1211(flush) <- :0(exec)(2) 1.000
-io.py:269(flush) <- io.py:1211(flush)(1) 0.000
+io.py:1331(flush) <- :0(exec)(2) 1.000
+io.py:269(flush) <- io.py:1331(flush)(1) 0.000
io.py:656(closed) <- io.py:874(flush)(1) 0.000
-io.py:874(flush) <- io.py:1211(flush)(1) 0.000
+io.py:874(flush) <- io.py:1331(flush)(1) 0.000
profile:0(profiler) <-
profile:0(testfunc()) <- profile:0(profiler)(1) 0.000
test_profile.py:103(subhelper) <- test_profile.py:93(helper2)(8)
0.400
Index: Lib/test/output/test_cProfile
===================================================================
--- Lib/test/output/test_cProfile (revision 59453)
+++ Lib/test/output/test_cProfile (working copy)
@@ -5,7 +5,7 @@
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 1.000 1.000 <string>:1(<module>)
- 2 0.000 0.000 0.000 0.000 io.py:1211(flush)
+ 2 0.000 0.000 0.000 0.000 io.py:1331(flush)
1 0.000 0.000 0.000 0.000 io.py:269(flush)
1 0.000 0.000 0.000 0.000 io.py:656(closed)
1 0.000 0.000 0.000 0.000 io.py:874(flush)
@@ -30,7 +30,7 @@
Function called...
ncalls tottime cumtime
<string>:1(<module>) -> 1 0.270 1.000
test_cProfile.py:30(testfunc)
-io.py:1211(flush) -> 1 0.000 0.000
io.py:269(flush)
+io.py:1331(flush) -> 1 0.000 0.000
io.py:269(flush)
1 0.000 0.000
io.py:874(flush)
io.py:269(flush) ->
io.py:656(closed) ->
@@ -53,7 +53,7 @@
test_cProfile.py:93(helper2) -> 8 0.064 0.080
test_cProfile.py:103(subhelper)
8 0.000 0.008
{hasattr}
{exec} -> 1 0.000 1.000
<string>:1(<module>)
- 2 0.000 0.000
io.py:1211(flush)
+ 2 0.000 0.000
io.py:1331(flush)
{hasattr} -> 12 0.012 0.012
test_cProfile.py:115(__getattr__)
{method 'append' of 'list' objects} ->
{method 'disable' of '_lsprof.Profiler' objects} ->
@@ -65,10 +65,10 @@
Function was called by...
ncalls tottime cumtime
<string>:1(<module>) <- 1 0.000 1.000
{exec}
-io.py:1211(flush) <- 2 0.000 0.000
{exec}
-io.py:269(flush) <- 1 0.000 0.000
io.py:1211(flush)
+io.py:1331(flush) <- 2 0.000 0.000
{exec}
+io.py:269(flush) <- 1 0.000 0.000
io.py:1331(flush)
io.py:656(closed) <- 1 0.000 0.000
io.py:874(flush)
-io.py:874(flush) <- 1 0.000 0.000
io.py:1211(flush)
+io.py:874(flush) <- 1 0.000 0.000
io.py:1331(flush)
test_cProfile.py:103(subhelper) <- 8 0.064 0.080
test_cProfile.py:93(helper2)
test_cProfile.py:115(__getattr__) <- 16 0.016 0.016
test_cProfile.py:103(subhelper)
12 0.012 0.012
{hasattr}
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com