Serhiy Storchaka added the comment:
Patch updated: fixed one error, and readline() is now optimized too.
Benchmark results (reading python.bz2):
Py2.7    Py3.2    Py3.3    Py3.3
                  vanilla  patched
4.8      4.8      -        31       read(1)
1        0.94     3.4e+02  3.6      read(10)
0.61     0.6      28       0.87     read(100)
0.58     0.58     3.4      0.61     read(1000)
0.59     0.57     0.88     0.58     read(10000)
0.57     0.56     0.62     0.58     read(100000)
0.57     0.57     0.59     0.58     read()
-        -        -        30       read1(1)
-        -        3.2e+02  3.6      read1(10)
-        -        27       0.88     read1(100)
-        -        3.3      0.61     read1(1000)
-        -        0.88     0.58     read1(10000)
-        -        0.61     0.57     read1(100000)
-        -        0.58     0.57     read1()
1.7      1.7      11       0.67     readline()
----------
Added file: http://bugs.python.org/file27314/bz2_faster_read.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16034>
_______________________________________
diff -r 6f456d9add40 Lib/bz2.py
--- a/Lib/bz2.py Wed Sep 26 13:11:48 2012 +0200
+++ b/Lib/bz2.py Wed Sep 26 16:49:44 2012 +0300
@@ -79,7 +79,8 @@
mode = "rb"
mode_code = _MODE_READ
self._decompressor = BZ2Decompressor()
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
elif mode in ("w", "wb"):
mode = "wb"
mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@
self._fp = None
self._closefp = False
self._mode = _MODE_CLOSED
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
@property
def closed(self):
@@ -172,14 +174,14 @@
raise io.UnsupportedOperation("The underlying file object "
"does not support seeking")
- # Fill the readahead buffer if it is empty. Returns False on EOF.
- def _fill_buffer(self):
+ # Non-buffered read and decompress next chunk of data.
+ # Always returns at least one byte of data, unless at EOF.
+ def _read1(self):
# Depending on the input data, our call to the decompressor may not
# return any data. In this case, try again after reading another block.
+ if self._mode == _MODE_READ_EOF:
+ return b''
while True:
- if self._buffer:
- return True
-
if self._decompressor.unused_data:
rawblock = self._decompressor.unused_data
else:
@@ -189,48 +191,70 @@
if self._decompressor.eof:
self._mode = _MODE_READ_EOF
self._size = self._pos
- return False
+ return b''
else:
raise EOFError("Compressed file ended before the "
- "end-of-stream marker was reached")
+ "end-of-stream marker was reached")
# Continue to next stream.
if self._decompressor.eof:
self._decompressor = BZ2Decompressor()
- self._buffer = self._decompressor.decompress(rawblock)
+ data = self._decompressor.decompress(rawblock)
+ if data:
+ return data
# Read data until EOF.
# If return_data is false, consume the data without returning it.
def _read_all(self, return_data=True):
+ data = self._buffer[self._offset:]
blocks = []
- while self._fill_buffer():
+ self._buffer = b''
+ self._offset = 0
+ while True:
if return_data:
- blocks.append(self._buffer)
- self._pos += len(self._buffer)
- self._buffer = None
+ blocks.append(data)
+ self._pos += len(data)
+ data = self._read1()
+ if not data:
+ break
if return_data:
return b"".join(blocks)
# Read a block of up to n bytes.
# If return_data is false, consume the data without returning it.
def _read_block(self, n, return_data=True):
+ if n <= 0:
+ return b''
+ end = n + self._offset
+ data = self._buffer[self._offset:end]
+ if end <= len(self._buffer):
+ self._offset = end
+ self._pos += len(data)
+ return data
+
blocks = []
- while n > 0 and self._fill_buffer():
- if n < len(self._buffer):
- data = self._buffer[:n]
- self._buffer = self._buffer[n:]
- else:
- data = self._buffer
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
+ while True:
if return_data:
blocks.append(data)
self._pos += len(data)
n -= len(data)
+ if not n:
+ break
+ data = self._read1()
+ if not data:
+ break
+ if n < len(data):
+ self._buffer = data
+ self._offset = n
+ data = data[:n]
+
if return_data:
return b"".join(blocks)
- def peek(self, n=0):
+ def peek(self, n=1):
"""Return buffered data without advancing the file position.
Always returns at least one byte of data, unless at EOF.
@@ -238,9 +262,11 @@
"""
with self._lock:
self._check_can_read()
- if self._mode == _MODE_READ_EOF or not self._fill_buffer():
- return b""
- return self._buffer
+ data = self._buffer[self._offset:]
+ if not data:
+ self._buffer = data = self._read1()
+ self._offset = 0
+ return data
def read(self, size=-1):
"""Read up to size uncompressed bytes from the file.
@@ -250,9 +276,7 @@
"""
with self._lock:
self._check_can_read()
- if self._mode == _MODE_READ_EOF or size == 0:
- return b""
- elif size < 0:
+ if size < 0:
return self._read_all()
else:
return self._read_block(size)
@@ -268,15 +292,18 @@
# In this case we make multiple reads, to avoid returning b"".
with self._lock:
self._check_can_read()
- if (size == 0 or self._mode == _MODE_READ_EOF or
- not self._fill_buffer()):
+ if size == 0:
return b""
- if 0 < size < len(self._buffer):
- data = self._buffer[:size]
- self._buffer = self._buffer[size:]
+ if self._offset == len(self._buffer):
+ self._buffer = self._read1()
+ self._offset = 0
+ if size > 0:
+ data = self._buffer[self._offset:self._offset + size]
+ self._offset += len(data)
else:
- data = self._buffer
- self._buffer = None
+ data = self._buffer[self._offset:]
+ self._buffer = b''
+ self._offset = 0
self._pos += len(data)
return data
@@ -299,6 +326,14 @@
raise TypeError("Integer argument expected")
size = size.__index__()
with self._lock:
+ if size < 0:
+ # Shortcut common case - newline found in buffer.
+ i = self._buffer.find(b'\n', self._offset) + 1
+ if i > 0:
+ line = self._buffer[self._offset: i]
+ self._offset = i
+ return line
+
return io.BufferedIOBase.readline(self, size)
def readlines(self, size=-1):
@@ -345,7 +380,8 @@
self._mode = _MODE_READ
self._pos = 0
self._decompressor = BZ2Decompressor()
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
def seek(self, offset, whence=0):
"""Change the file position.
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com