Serhiy Storchaka added the comment:
Here is a patch and benchmark script. This required more time than I thought.
Benchmark results (seconds per complete read of the test file, as printed by bz2bench.py; lower is better):
Unpatched:
5.3 read(1)
0.5 read(10)
0.049 read(100)
0.013 read(1000)
0.009 read(10000)
0.0085 read(100000)
0.0082 read()
5 read1(1)
0.47 read1(10)
0.046 read1(100)
0.012 read1(1000)
0.0089 read1(10000)
0.0084 read1(100000)
0.0082 read1()
0.15 readline()
Patched:
0.73 read(1)
0.082 read(10)
0.015 read(100)
0.0089 read(1000)
0.0082 read(10000)
0.0084 read(100000)
0.0083 read()
0.78 read1(1)
0.087 read1(10)
0.016 read1(100)
0.0089 read1(1000)
0.0082 read1(10000)
0.0082 read1(100000)
0.008 read1()
0.14 readline()
----------
keywords: +patch
Added file: http://bugs.python.org/file27310/bz2_faster_read.patch
Added file: http://bugs.python.org/file27311/bz2bench.py
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16034>
_______________________________________
diff -r 82f25e941be2 Lib/bz2.py
--- a/Lib/bz2.py Tue Sep 25 12:34:54 2012 -0700
+++ b/Lib/bz2.py Wed Sep 26 12:34:43 2012 +0300
@@ -79,7 +79,8 @@
mode = "rb"
mode_code = _MODE_READ
self._decompressor = BZ2Decompressor()
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
elif mode in ("w", "wb"):
mode = "wb"
mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@
self._fp = None
self._closefp = False
self._mode = _MODE_CLOSED
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
@property
def closed(self):
@@ -172,14 +174,14 @@
raise io.UnsupportedOperation("The underlying file object "
"does not support seeking")
- # Fill the readahead buffer if it is empty. Returns False on EOF.
- def _fill_buffer(self):
+ # Non-buffered read and decompress next chunk of data.
+ # Always returns at least one byte of data, unless at EOF.
+ def _read1(self):
# Depending on the input data, our call to the decompressor may not
# return any data. In this case, try again after reading another block.
+ if self._mode == _MODE_READ_EOF:
+ return b''
while True:
- if self._buffer:
- return True
-
if self._decompressor.unused_data:
rawblock = self._decompressor.unused_data
else:
@@ -189,48 +191,70 @@
if self._decompressor.eof:
self._mode = _MODE_READ_EOF
self._size = self._pos
- return False
+ return b''
else:
raise EOFError("Compressed file ended before the "
- "end-of-stream marker was reached")
+ "end-of-stream marker was reached")
# Continue to next stream.
if self._decompressor.eof:
self._decompressor = BZ2Decompressor()
- self._buffer = self._decompressor.decompress(rawblock)
+ data = self._decompressor.decompress(rawblock)
+ if data:
+ return data
# Read data until EOF.
# If return_data is false, consume the data without returning it.
def _read_all(self, return_data=True):
+ data = self._buffer[self._offset:]
blocks = []
- while self._fill_buffer():
+ self._buffer = b''
+ self._offset = 0
+ while True:
if return_data:
- blocks.append(self._buffer)
- self._pos += len(self._buffer)
- self._buffer = None
+ blocks.append(data)
+ self._pos += len(data)
+ data = self._read1()
+ if not data:
+ break
if return_data:
return b"".join(blocks)
# Read a block of up to n bytes.
# If return_data is false, consume the data without returning it.
def _read_block(self, n, return_data=True):
+ if n <= 0:
+ return b''
+ end = n + self._offset
+ data = self._buffer[self._offset:end]
+ if data:
+ self._offset = end
+ self._pos += len(data)
+ return data
+
blocks = []
- while n > 0 and self._fill_buffer():
- if n < len(self._buffer):
- data = self._buffer[:n]
- self._buffer = self._buffer[n:]
- else:
- data = self._buffer
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
+ while True:
if return_data:
blocks.append(data)
self._pos += len(data)
n -= len(data)
+ if not n:
+ break
+ data = self._read1()
+ if not data:
+ break
+ if n < len(data):
+ self._buffer = data
+ self._offset = n
+ data = data[:n]
+
if return_data:
return b"".join(blocks)
- def peek(self, n=0):
+ def peek(self, n=1):
"""Return buffered data without advancing the file position.
Always returns at least one byte of data, unless at EOF.
@@ -238,9 +262,11 @@
"""
with self._lock:
self._check_can_read()
- if self._mode == _MODE_READ_EOF or not self._fill_buffer():
- return b""
- return self._buffer
+ data = self._buffer[self._offset:]
+ if not data:
+ self._buffer = data = self._read1()
+ self._offset = 0
+ return data
def read(self, size=-1):
"""Read up to size uncompressed bytes from the file.
@@ -250,9 +276,7 @@
"""
with self._lock:
self._check_can_read()
- if self._mode == _MODE_READ_EOF or size == 0:
- return b""
- elif size < 0:
+ if size < 0:
return self._read_all()
else:
return self._read_block(size)
@@ -268,15 +292,18 @@
# In this case we make multiple reads, to avoid returning b"".
with self._lock:
self._check_can_read()
- if (size == 0 or self._mode == _MODE_READ_EOF or
- not self._fill_buffer()):
+ if size == 0:
return b""
- if 0 < size < len(self._buffer):
- data = self._buffer[:size]
- self._buffer = self._buffer[size:]
+ if self._offset == len(self._buffer):
+ self._buffer = self._read1()
+ self._offset = 0
+ if size > 0:
+ data = self._buffer[self._offset:self._offset + size]
+ self._offset += len(data)
else:
- data = self._buffer
- self._buffer = None
+ data = self._buffer[self._offset:]
+ self._buffer = b''
+ self._offset = 0
self._pos += len(data)
return data
@@ -345,7 +372,8 @@
self._mode = _MODE_READ
self._pos = 0
self._decompressor = BZ2Decompressor()
- self._buffer = None
+ self._buffer = b''
+ self._offset = 0
def seek(self, offset, whence=0):
"""Change the file position.
# -*- coding: utf-8 -*-
import sys, bz2, timeit
# Usage: bz2bench.py FILE [MIN_SECONDS]
#   FILE         path handed to bz2.BZ2File in every timed statement.
#   MIN_SECONDS  minimum total duration a timing run must reach before its
#                result is accepted (defaults to 2).
fn = sys.argv[1]
min_t = float(sys.argv[2]) if len(sys.argv) > 2 else 2

# Call expressions to benchmark. Each one is evaluated in a loop on an open
# BZ2File until it returns a falsy value.
sizes = (1, 10, 100, 1000, 10000, 100000)
benchs = ['read(%d)' % size for size in sizes]
benchs.append('read()')
benchs += ['read1(%d)' % size for size in sizes]
benchs += ['read1()', 'readline()']

for s in benchs:
    # Timed statement: open the file and drain it with the call under test.
    stmt = 'with bz2.BZ2File(%r) as f:\n while f.%s: pass' % (fn, s)
    n = 1
    while True:
        t = timeit.timeit(stmt, setup='import bz2', number=n)
        if t >= min_t:
            break
        # Too short to be accepted: scale the repeat count towards min_t
        # (with a 10% margin), but never grow it more than tenfold per step.
        n = min(int(n * min_t / t * 1.1) + 1, 10 * n)
    # Report the average time of a single execution of the statement.
    print('%.2g\t%s' % (t / n, s))
    sys.stdout.flush()
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com