Serhiy Storchaka added the comment:

Here is a patch and benchmark script. This required more time than I thought.

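Benchmark results (average seconds per complete pass over the file, as printed by the attached bz2bench.py; lower is better):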

Unpatched:

5.3     read(1)
0.5     read(10)
0.049   read(100)
0.013   read(1000)
0.009   read(10000)
0.0085  read(100000)
0.0082  read()
5       read1(1)
0.47    read1(10)
0.046   read1(100)
0.012   read1(1000)
0.0089  read1(10000)
0.0084  read1(100000)
0.0082  read1()
0.15    readline()

Patched:

0.73    read(1)
0.082   read(10)
0.015   read(100)
0.0089  read(1000)
0.0082  read(10000)
0.0084  read(100000)
0.0083  read()
0.78    read1(1)
0.087   read1(10)
0.016   read1(100)
0.0089  read1(1000)
0.0082  read1(10000)
0.0082  read1(100000)
0.008   read1()
0.14    readline()

----------
keywords: +patch
Added file: http://bugs.python.org/file27310/bz2_faster_read.patch
Added file: http://bugs.python.org/file27311/bz2bench.py

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16034>
_______________________________________
diff -r 82f25e941be2 Lib/bz2.py
--- a/Lib/bz2.py        Tue Sep 25 12:34:54 2012 -0700
+++ b/Lib/bz2.py        Wed Sep 26 12:34:43 2012 +0300
@@ -79,7 +79,8 @@
             mode = "rb"
             mode_code = _MODE_READ
             self._decompressor = BZ2Decompressor()
-            self._buffer = None
+            self._buffer = b''
+            self._offset = 0
         elif mode in ("w", "wb"):
             mode = "wb"
             mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@
                     self._fp = None
                     self._closefp = False
                     self._mode = _MODE_CLOSED
-                    self._buffer = None
+                    self._buffer = b''
+                    self._offset = 0
 
     @property
     def closed(self):
@@ -172,14 +174,14 @@
             raise io.UnsupportedOperation("The underlying file object "
                                           "does not support seeking")
 
-    # Fill the readahead buffer if it is empty. Returns False on EOF.
-    def _fill_buffer(self):
+    # Non-buffered read and decompress next chunk of data.
+    # Always returns at least one byte of data, unless at EOF.
+    def _read1(self):
         # Depending on the input data, our call to the decompressor may not
         # return any data. In this case, try again after reading another block.
+        if self._mode == _MODE_READ_EOF:
+            return b''
         while True:
-            if self._buffer:
-                return True
-
             if self._decompressor.unused_data:
                 rawblock = self._decompressor.unused_data
             else:
@@ -189,48 +191,70 @@
                 if self._decompressor.eof:
                     self._mode = _MODE_READ_EOF
                     self._size = self._pos
-                    return False
+                    return b''
                 else:
                     raise EOFError("Compressed file ended before the "
-                                   "end-of-stream marker was reached")
+                                    "end-of-stream marker was reached")
 
             # Continue to next stream.
             if self._decompressor.eof:
                 self._decompressor = BZ2Decompressor()
 
-            self._buffer = self._decompressor.decompress(rawblock)
+            data = self._decompressor.decompress(rawblock)
+            if data:
+                return data
 
     # Read data until EOF.
     # If return_data is false, consume the data without returning it.
     def _read_all(self, return_data=True):
+        data = self._buffer[self._offset:]
         blocks = []
-        while self._fill_buffer():
+        self._buffer = b''
+        self._offset = 0
+        while True:
             if return_data:
-                blocks.append(self._buffer)
-            self._pos += len(self._buffer)
-            self._buffer = None
+                blocks.append(data)
+            self._pos += len(data)
+            data = self._read1()
+            if not data:
+                break
         if return_data:
             return b"".join(blocks)
 
     # Read a block of up to n bytes.
     # If return_data is false, consume the data without returning it.
     def _read_block(self, n, return_data=True):
+        if n <= 0:
+            return b''
+        end = n + self._offset
+        data = self._buffer[self._offset:end]
+        if data:
+            self._offset = end
+            self._pos += len(data)
+            return data
+
         blocks = []
-        while n > 0 and self._fill_buffer():
-            if n < len(self._buffer):
-                data = self._buffer[:n]
-                self._buffer = self._buffer[n:]
-            else:
-                data = self._buffer
-                self._buffer = None
+        self._buffer = b''
+        self._offset = 0
+        while True:
             if return_data:
                 blocks.append(data)
             self._pos += len(data)
             n -= len(data)
+            if not n:
+                break
+            data = self._read1()
+            if not data:
+                break
+            if n < len(data):
+                self._buffer = data
+                self._offset = n
+                data = data[:n]
+
         if return_data:
             return b"".join(blocks)
 
-    def peek(self, n=0):
+    def peek(self, n=1):
         """Return buffered data without advancing the file position.
 
         Always returns at least one byte of data, unless at EOF.
@@ -238,9 +262,11 @@
         """
         with self._lock:
             self._check_can_read()
-            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
-                return b""
-            return self._buffer
+            data = self._buffer[self._offset:]
+            if not data:
+                self._buffer = data = self._read1()
+                self._offset = 0
+            return data
 
     def read(self, size=-1):
         """Read up to size uncompressed bytes from the file.
@@ -250,9 +276,7 @@
         """
         with self._lock:
             self._check_can_read()
-            if self._mode == _MODE_READ_EOF or size == 0:
-                return b""
-            elif size < 0:
+            if size < 0:
                 return self._read_all()
             else:
                 return self._read_block(size)
@@ -268,15 +292,18 @@
         # In this case we make multiple reads, to avoid returning b"".
         with self._lock:
             self._check_can_read()
-            if (size == 0 or self._mode == _MODE_READ_EOF or
-                not self._fill_buffer()):
+            if size == 0:
                 return b""
-            if 0 < size < len(self._buffer):
-                data = self._buffer[:size]
-                self._buffer = self._buffer[size:]
+            if self._offset == len(self._buffer):
+                self._buffer = self._read1()
+                self._offset = 0
+            if size > 0:
+                data = self._buffer[self._offset:self._offset + size]
+                self._offset += len(data)
             else:
-                data = self._buffer
-                self._buffer = None
+                data = self._buffer[self._offset:]
+                self._buffer = b''
+                self._offset = 0
             self._pos += len(data)
             return data
 
@@ -345,7 +372,8 @@
         self._mode = _MODE_READ
         self._pos = 0
         self._decompressor = BZ2Decompressor()
-        self._buffer = None
+        self._buffer = b''
+        self._offset = 0
 
     def seek(self, offset, whence=0):
         """Change the file position.
# -*- coding: utf-8 -*-
import sys, bz2, timeit

def _benchmark_calls():
    """Return the BZ2File read calls to benchmark, in report order."""
    sizes = (1, 10, 100, 1000, 10000, 100000)
    calls = []
    for method in ('read', 'read1'):
        calls.extend('%s(%d)' % (method, size) for size in sizes)
        calls.append('%s()' % method)
    calls.append('readline()')
    return calls


def _time_call(fn, call, min_t):
    """Return the average time in seconds of one full pass over *fn*.

    One pass opens *fn* with bz2.BZ2File and repeats ``f.<call>`` until it
    returns a false value.  The number of passes is increased until a single
    timeit measurement lasts at least *min_t* seconds.
    """
    stmt = 'with bz2.BZ2File(%r) as f:\n  while f.%s: pass' % (fn, call)
    n = 1
    while True:
        t = timeit.timeit(stmt, 'import bz2', number=n)
        if t >= min_t:
            return t / n
        if t > 0:
            # Aim slightly past min_t, but grow by at most 10x per step so
            # one noisy short measurement cannot explode the repeat count.
            n = min(int(n * min_t / t * 1.1) + 1, 10 * n)
        else:
            # The measurement was below the timer resolution; scaling by
            # min_t / t would divide by zero, so take the maximum step.
            n *= 10


def main(argv=None):
    """Benchmark BZ2File read methods on a bzip2-compressed file.

    *argv* is the argument list without the program name and defaults to
    ``sys.argv[1:]``.  The first argument is the path of the compressed
    file; the optional second argument is the minimum measuring time in
    seconds for each benchmark (default 2).  Further arguments are ignored.

    Prints one ``<seconds per pass>\\t<call>`` line per benchmark.
    Exits with a usage message when no file name is given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit('usage: bz2bench.py FILE.bz2 [MIN_SECONDS]')
    fn = args[0]
    min_t = float(args[1]) if len(args) > 1 else 2

    for call in _benchmark_calls():
        print('%.2g\t%s' % (_time_call(fn, call, min_t), call))
        # Flush after every row so results appear while later, slower
        # benchmarks are still running.
        sys.stdout.flush()


if __name__ == '__main__':
    main()
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to