Serhiy Storchaka added the comment:

Patch updated. Fixed one error. Now readline() is optimized too.

Benchmark results (reading python.bz2):

Py2.7   Py3.2   Py3.3   Py3.3
                vanilla patched
4.8     4.8     -       31      read(1)
1       0.94    3.4e+02 3.6     read(10)
0.61    0.6     28      0.87    read(100)
0.58    0.58    3.4     0.61    read(1000)
0.59    0.57    0.88    0.58    read(10000)
0.57    0.56    0.62    0.58    read(100000)
0.57    0.57    0.59    0.58    read()
-       -       -       30      read1(1)
-       -       3.2e+02 3.6     read1(10)
-       -       27      0.88    read1(100)
-       -       3.3     0.61    read1(1000)
-       -       0.88    0.58    read1(10000)
-       -       0.61    0.57    read1(100000)
-       -       0.58    0.57    read1()
1.7     1.7     11      0.67    readline()

----------
Added file: http://bugs.python.org/file27314/bz2_faster_read.patch

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue16034>
_______________________________________
diff -r 6f456d9add40 Lib/bz2.py
--- a/Lib/bz2.py        Wed Sep 26 13:11:48 2012 +0200
+++ b/Lib/bz2.py        Wed Sep 26 16:49:44 2012 +0300
@@ -79,7 +79,8 @@
             mode = "rb"
             mode_code = _MODE_READ
             self._decompressor = BZ2Decompressor()
-            self._buffer = None
+            self._buffer = b''
+            self._offset = 0
         elif mode in ("w", "wb"):
             mode = "wb"
             mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@
                     self._fp = None
                     self._closefp = False
                     self._mode = _MODE_CLOSED
-                    self._buffer = None
+                    self._buffer = b''
+                    self._offset = 0
 
     @property
     def closed(self):
@@ -172,14 +174,14 @@
             raise io.UnsupportedOperation("The underlying file object "
                                           "does not support seeking")
 
-    # Fill the readahead buffer if it is empty. Returns False on EOF.
-    def _fill_buffer(self):
+    # Non-buffered read and decompress next chunk of data.
+    # Always returns at least one byte of data, unless at EOF.
+    def _read1(self):
         # Depending on the input data, our call to the decompressor may not
         # return any data. In this case, try again after reading another block.
+        if self._mode == _MODE_READ_EOF:
+            return b''
         while True:
-            if self._buffer:
-                return True
-
             if self._decompressor.unused_data:
                 rawblock = self._decompressor.unused_data
             else:
@@ -189,48 +191,70 @@
                 if self._decompressor.eof:
                     self._mode = _MODE_READ_EOF
                     self._size = self._pos
-                    return False
+                    return b''
                 else:
                     raise EOFError("Compressed file ended before the "
-                                   "end-of-stream marker was reached")
+                                    "end-of-stream marker was reached")
 
             # Continue to next stream.
             if self._decompressor.eof:
                 self._decompressor = BZ2Decompressor()
 
-            self._buffer = self._decompressor.decompress(rawblock)
+            data = self._decompressor.decompress(rawblock)
+            if data:
+                return data
 
     # Read data until EOF.
     # If return_data is false, consume the data without returning it.
     def _read_all(self, return_data=True):
+        data = self._buffer[self._offset:]
         blocks = []
-        while self._fill_buffer():
+        self._buffer = b''
+        self._offset = 0
+        while True:
             if return_data:
-                blocks.append(self._buffer)
-            self._pos += len(self._buffer)
-            self._buffer = None
+                blocks.append(data)
+            self._pos += len(data)
+            data = self._read1()
+            if not data:
+                break
         if return_data:
             return b"".join(blocks)
 
     # Read a block of up to n bytes.
     # If return_data is false, consume the data without returning it.
     def _read_block(self, n, return_data=True):
+        if n <= 0:
+            return b''
+        end = n + self._offset
+        data = self._buffer[self._offset:end]
+        if end <= len(self._buffer):
+            self._offset = end
+            self._pos += len(data)
+            return data
+
         blocks = []
-        while n > 0 and self._fill_buffer():
-            if n < len(self._buffer):
-                data = self._buffer[:n]
-                self._buffer = self._buffer[n:]
-            else:
-                data = self._buffer
-                self._buffer = None
+        self._buffer = b''
+        self._offset = 0
+        while True:
             if return_data:
                 blocks.append(data)
             self._pos += len(data)
             n -= len(data)
+            if not n:
+                break
+            data = self._read1()
+            if not data:
+                break
+            if n < len(data):
+                self._buffer = data
+                self._offset = n
+                data = data[:n]
+
         if return_data:
             return b"".join(blocks)
 
-    def peek(self, n=0):
+    def peek(self, n=1):
         """Return buffered data without advancing the file position.
 
         Always returns at least one byte of data, unless at EOF.
@@ -238,9 +262,11 @@
         """
         with self._lock:
             self._check_can_read()
-            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
-                return b""
-            return self._buffer
+            data = self._buffer[self._offset:]
+            if not data:
+                self._buffer = data = self._read1()
+                self._offset = 0
+            return data
 
     def read(self, size=-1):
         """Read up to size uncompressed bytes from the file.
@@ -250,9 +276,7 @@
         """
         with self._lock:
             self._check_can_read()
-            if self._mode == _MODE_READ_EOF or size == 0:
-                return b""
-            elif size < 0:
+            if size < 0:
                 return self._read_all()
             else:
                 return self._read_block(size)
@@ -268,15 +292,18 @@
         # In this case we make multiple reads, to avoid returning b"".
         with self._lock:
             self._check_can_read()
-            if (size == 0 or self._mode == _MODE_READ_EOF or
-                not self._fill_buffer()):
+            if size == 0:
                 return b""
-            if 0 < size < len(self._buffer):
-                data = self._buffer[:size]
-                self._buffer = self._buffer[size:]
+            if self._offset == len(self._buffer):
+                self._buffer = self._read1()
+                self._offset = 0
+            if size > 0:
+                data = self._buffer[self._offset:self._offset + size]
+                self._offset += len(data)
             else:
-                data = self._buffer
-                self._buffer = None
+                data = self._buffer[self._offset:]
+                self._buffer = b''
+                self._offset = 0
             self._pos += len(data)
             return data
 
@@ -299,6 +326,14 @@
             raise TypeError("Integer argument expected")
         size = size.__index__()
         with self._lock:
+            if size < 0:
+                # Shortcut common case - newline found in buffer.
+                i = self._buffer.find(b'\n', self._offset) + 1
+                if i > 0:
+                    line = self._buffer[self._offset: i]
+                    self._offset = i
+                    return line
+
             return io.BufferedIOBase.readline(self, size)
 
     def readlines(self, size=-1):
@@ -345,7 +380,8 @@
         self._mode = _MODE_READ
         self._pos = 0
         self._decompressor = BZ2Decompressor()
-        self._buffer = None
+        self._buffer = b''
+        self._offset = 0
 
     def seek(self, offset, whence=0):
         """Change the file position.
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to