This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository python-xopen.
commit 1cbdbf4f69dfb0e144d6e8836044557758ded3e3 Author: Andreas Tille <[email protected]> Date: Sat Feb 10 13:27:16 2018 +0100 New upstream version 0.3.2 --- .travis.yml | 3 +- README.rst | 21 ++-- setup.cfg | 2 - setup.py | 32 +++--- tests/file.txt.bz2 | Bin 71 -> 118 bytes tests/hello.gz | Bin 0 -> 25 bytes tests/{testxopen.py => test_xopen.py} | 58 +++++++++-- tox.ini | 2 +- xopen.py | 188 ++++++++++++++++++---------------- 9 files changed, 189 insertions(+), 117 deletions(-) diff --git a/.travis.yml b/.travis.yml index 15895bb..311b5ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,15 +4,14 @@ cache: directories: - $HOME/.cache/pip python: - - "2.6" - "2.7" - "3.3" - "3.4" - "3.5" + - "3.6" install: - pip install . script: - nosetests -P tests - diff --git a/README.rst b/README.rst index 248b9dd..96a4164 100644 --- a/README.rst +++ b/README.rst @@ -8,38 +8,47 @@ xopen ===== -This small Python module provides a ``xopen`` function that works like the +This small Python module provides an ``xopen`` function that works like the built-in ``open`` function, but can also deal with compressed files. Supported compression formats are gzip, bzip2 and xz. They are automatically recognized by their file extensions `.gz`, `.bz2` or `.xz`. The focus is on being as efficient as possible on all supported Python versions. -For example, simply using ``gzip.open`` is slow in older Pythons, and it is -a lot faster to use a ``gzip`` subprocess. +For example, simply using ``gzip.open`` is very slow in older Pythons, and +it is a lot faster to use a ``gzip`` subprocess. For writing to gzip files, +``xopen`` uses ``pigz`` when available. This module has originally been developed as part of the `cutadapt tool <https://cutadapt.readthedocs.io/>`_ that is used in bioinformatics to manipulate sequencing data. It has been in successful use within that software for a few years. +``xopen`` is compatible with Python 2.7, 3.3, 3.4, 3.5 and 3.6. + Usage ----- Open a file for reading:: - with open('file.txt.xz') as f: + from xopen import xopen + + with xopen('file.txt.xz') as f: content = f.read() Or without context manager:: - f = open('file.txt.xz') + from xopen import xopen + + f = xopen('file.txt.xz') content = f.read() f.close() Open a file for writing:: - with open('file.txt.gz', mode='w') as f: + from xopen import xopen + + with xopen('file.txt.gz', mode='w') as f: f.write('Hello') diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 3c6e79c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal=1 diff --git a/setup.py b/setup.py index 13fccc8..ea3ddf1 100644 --- a/setup.py +++ b/setup.py @@ -1,31 +1,37 @@ import sys from setuptools import setup -if sys.version_info < (2, 6): - sys.stdout.write("At least Python 2.6 is required.\n") +if sys.version_info < (2, 7): + sys.stdout.write("At least Python 2.7 is required.\n") sys.exit(1) with open('README.rst') as f: long_description = f.read() +if sys.version_info < (3, ): + requires = ['bz2file'] +else: + requires = [] + setup( - name = 'xopen', - version = '0.1.1', - author = 'Marcel Martin', - author_email = '[email protected]', - url = 'https://github.com/marcelm/xopen/', - description = 'Open compressed files transparently', - long_description = long_description, - license = 'MIT', - py_modules = ['xopen'], - classifiers = [ + name='xopen', + version='0.3.2', + author='Marcel Martin', + author_email='[email protected]', + url='https://github.com/marcelm/xopen/', + description='Open compressed files transparently', + long_description=long_description, + license='MIT', + py_modules=['xopen'], + install_requires=requires, + classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", ] ) diff --git a/tests/file.txt.bz2 b/tests/file.txt.bz2 index 82a5dcc..defbf7d 100644 Binary files a/tests/file.txt.bz2 and b/tests/file.txt.bz2 differ diff --git a/tests/hello.gz b/tests/hello.gz new file mode 100644 index 0000000..73227c4 Binary files /dev/null and b/tests/hello.gz differ diff --git a/tests/testxopen.py b/tests/test_xopen.py similarity index 76% rename from tests/testxopen.py rename to tests/test_xopen.py index c0ba78e..ba04eee 100644 --- a/tests/testxopen.py +++ b/tests/test_xopen.py @@ -7,7 +7,7 @@ import sys import signal from contextlib import contextmanager from nose.tools import raises -from xopen import xopen +from xopen import xopen, PipedGzipReader base = "tests/file.txt" @@ -18,6 +18,10 @@ try: except ImportError: lzma = None +try: + import bz2 +except ImportError: + bz2 = None major, minor = sys.version_info[0:2] @@ -119,19 +123,24 @@ if lzma: def test_append(): - for ext in ["", ".gz"]: # BZ2 does NOT support append - text = "AB" - if ext != "": - text = text.encode("utf-8") # On Py3, need to send BYTES, not unicode + cases = ["", ".gz"] + if bz2 and sys.version_info > (3,): + # BZ2 does NOT support append in Py 2. + cases.append(".bz2") + if lzma: + cases.append(".xz") + for ext in cases: + # On Py3, need to send BYTES, not unicode. Let's do it for all. + text = "AB".encode("utf-8") reference = text + text with temporary_path('truncated.fastq' + ext) as path: try: os.unlink(path) except OSError: pass - with xopen(path, 'a') as f: + with xopen(path, 'ab') as f: f.write(text) - with xopen(path, 'a') as f: + with xopen(path, 'ab') as f: f.write(text) with xopen(path, 'r') as f: for appended in f: @@ -143,6 +152,31 @@ def test_append(): assert appended == reference +def test_append_text(): + cases = ["", ".gz"] + if bz2 and sys.version_info > (3,): + # BZ2 does NOT support append in Py 2. + cases.append(".bz2") + if lzma: + cases.append(".xz") + for ext in cases: # BZ2 does NOT support append + text = "AB" + reference = text + text + with temporary_path('truncated.fastq' + ext) as path: + try: + os.unlink(path) + except OSError: + pass + with xopen(path, 'at') as f: + f.write(text) + with xopen(path, 'at') as f: + f.write(text) + with xopen(path, 'rt') as f: + for appended in f: + pass + assert appended == reference + + def create_truncated_file(path): # Random text random_text = ''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(1024)) @@ -195,3 +229,13 @@ if sys.version_info[:2] != (3, 3): for line in f: pass f.close() + + +def test_bare_read_from_gz(): + with xopen('tests/hello.gz', 'rt') as f: + assert f.read() == 'hello' + + +def test_read_piped_gzip(): + with PipedGzipReader('tests/hello.gz', 'rt') as f: + assert f.read() == 'hello' diff --git a/tox.ini b/tox.ini index 43c4de1..d3f5008 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py26,py27,py33,py34,py35 +envlist = py27,py33,py34,py35,py36 [testenv] deps = nose diff --git a/xopen.py b/xopen.py index 114ff16..29cb0c3 100644 --- a/xopen.py +++ b/xopen.py @@ -10,13 +10,18 @@ import os import time from subprocess import Popen, PIPE -_PY3 = sys.version > '3' +__version__ = '0.3.2' -try: - import bz2 -except ImportError: - bz2 = None +_PY3 = sys.version > '3' + +if not _PY3: + import bz2file as bz2 +else: + try: + import bz2 + except ImportError: + bz2 = None try: import lzma @@ -26,29 +31,41 @@ except ImportError: if _PY3: basestring = str -else: - basestring = basestring -if sys.version_info < (2, 7): - buffered_reader = lambda x: x - buffered_writer = lambda x: x -else: - buffered_reader = io.BufferedReader - buffered_writer = io.BufferedWriter +class Closing(object): + """ + Inherit from this class and implement a close() method to offer context + manager functionality. + """ + def __enter__(self): + return self + + def __exit__(self, *exc_info): + self.close() + + def __del__(self): + try: + self.close() + except: + pass -class PipedGzipWriter(object): +class PipedGzipWriter(Closing): """ - Write gzip-compressed files by running an external gzip process and piping - into it. On Python 2, this is faster than using gzip.open. If pigz is - available, that is used instead of gzip. + Write gzip-compressed files by running an external gzip or pigz process and + piping into it. On Python 2, this is faster than using gzip.open(). On + Python 3, it allows to run the compression in a separate process and can + therefore also be faster. """ - def __init__(self, path, mode='w'): + def __init__(self, path, mode='wt'): + if mode not in ('w', 'wt', 'wb', 'a', 'at', 'ab'): + raise ValueError("Mode is '{0}', but it must be 'w', 'wt', 'wb', 'a', 'at' or 'ab'".format(mode)) self.outfile = open(path, mode) - self.devnull = open(os.devnull, 'w') + self.devnull = open(os.devnull, mode) self.closed = False + self.name = path # Setting close_fds to True in the Popen arguments is necessary due to # <http://bugs.python.org/issue12786>. @@ -57,7 +74,7 @@ class PipedGzipWriter(object): self.process = Popen(['pigz'], **kwargs) self.program = 'pigz' except OSError as e: - # binary not found, try regular gzip + # pigz not found, try regular gzip try: self.process = Popen(['gzip'], **kwargs) self.program = 'gzip' @@ -69,29 +86,38 @@ class PipedGzipWriter(object): self.outfile.close() self.devnull.close() raise + if _PY3 and 'b' not in mode: + self._file = io.TextIOWrapper(self.process.stdin) + else: + self._file = self.process.stdin def write(self, arg): - self.process.stdin.write(arg) + self._file.write(arg) def close(self): self.closed = True - self.process.stdin.close() + self._file.close() retcode = self.process.wait() self.outfile.close() self.devnull.close() if retcode != 0: raise IOError("Output {0} process terminated with exit code {1}".format(self.program, retcode)) - def __enter__(self): - return self - def __exit__(self, *exc_info): - self.close() - - -class PipedGzipReader(object): - def __init__(self, path): +class PipedGzipReader(Closing): + def __init__(self, path, mode='r'): + if mode not in ('r', 'rt', 'rb'): + raise ValueError("Mode is '{0}', but it must be 'r', 'rt' or 'rb'".format(mode)) self.process = Popen(['gzip', '-cd', path], stdout=PIPE, stderr=PIPE) + self.name = path + if _PY3 and not 'b' in mode: + self._file = io.TextIOWrapper(self.process.stdout) + else: + self._file = self.process.stdout + if _PY3: + self._stderr = io.TextIOWrapper(self.process.stderr) + else: + self._stderr = self.process.stderr self.closed = False # Give gzip a little bit of time to report any errors (such as # a non-existing file) @@ -107,7 +133,7 @@ class PipedGzipReader(object): self._raise_if_error() def __iter__(self): - for line in self.process.stdout: + for line in self._file: yield line self.process.wait() self._raise_if_error() @@ -119,29 +145,16 @@ class PipedGzipReader(object): """ retcode = self.process.poll() if retcode is not None and retcode != 0: - message = self.process.stderr.read().strip() + message = self._stderr.read().strip() raise IOError(message) def read(self, *args): - data = self.process.stdout.read(*args) + data = self._file.read(*args) if len(args) == 0 or args[0] <= 0: # wait for process to terminate until we check the exit code self.process.wait() self._raise_if_error() - - def __enter__(self): - return self - - def __exit__(self, *exc_info): - self.close() - - -class Closing(object): - def __enter__(self): - return self - - def __exit__(self, *exc_info): - self.close() + return data if bz2 is not None: @@ -152,7 +165,7 @@ if bz2 is not None: """ -def xopen(filename, mode='r'): +def xopen(filename, mode='r', compresslevel=6): """ Replacement for the "open" function that can also open files that have been compressed with gzip, bzip2 or xz. If the filename is '-', standard @@ -162,18 +175,20 @@ def xopen(filename, mode='r'): the pipe to the gzip program). If the filename ends with .bz2, it's opened as a bz2.BZ2File. Otherwise, the regular open() is used. - mode can be: 'rt', 'rb', 'a', 'wt', or 'wb' - Instead of 'rt' and 'wt', 'r' and 'w' can be used as abbreviations. + mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb' + Instead of 'rt', 'wt' and 'at', 'r', 'w' and 'a' can be used as + abbreviations. In Python 2, the 't' and 'b' characters are ignored. - Append mode ('a') is unavailable with BZ2 compression and will raise an error. + Append mode ('a', 'at', 'ab') is unavailable with BZ2 compression and + will raise an error. + + compresslevel is the gzip compression level. It is not used for bz2 and xz. """ - if mode == 'r': - mode = 'rt' - elif mode == 'w': - mode = 'wt' - if mode not in ('rt', 'rb', 'wt', 'wb', 'a'): + if mode in ('r', 'w', 'a'): + mode += 't' + if mode not in ('rt', 'rb', 'wt', 'wb', 'at', 'ab'): raise ValueError("mode '{0}' not supported".format(mode)) if not _PY3: mode = mode[0] @@ -182,52 +197,53 @@ def xopen(filename, mode='r'): # standard input and standard output handling if filename == '-': - if not _PY3: - return sys.stdin if 'r' in mode else sys.stdout return dict( + r=sys.stdin, rt=sys.stdin, - wt=sys.stdout, rb=sys.stdin.buffer, + w=sys.stdout, + wt=sys.stdout, wb=sys.stdout.buffer)[mode] if filename.endswith('.bz2'): if bz2 is None: raise ImportError("Cannot open bz2 files: The bz2 module is not available") if _PY3: - if 't' in mode: - return io.TextIOWrapper(bz2.BZ2File(filename, mode[0])) + return bz2.open(filename, mode) + else: + if mode[0] == 'a': + raise ValueError("mode '{0}' not supported with BZ2 compression".format(mode)) + if sys.version_info[:2] <= (2, 6): + return ClosingBZ2File(filename, mode) else: return bz2.BZ2File(filename, mode) - elif sys.version_info[:2] <= (2, 6): - return ClosingBZ2File(filename, mode) - else: - return bz2.BZ2File(filename, mode) elif filename.endswith('.xz'): if lzma is None: raise ImportError("Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)") return lzma.open(filename, mode) elif filename.endswith('.gz'): - if _PY3: - if 't' in mode: - # gzip.open in Python 3.2 does not support modes 'rt' and 'wt'' - return io.TextIOWrapper(gzip.open(filename, mode[0])) - else: - if 'r' in mode: - return io.BufferedReader(gzip.open(filename, mode)) - else: - return io.BufferedWriter(gzip.open(filename, mode)) + if _PY3 and 'r' in mode: + return gzip.open(filename, mode) + if sys.version_info[:2] == (2, 7): + buffered_reader = io.BufferedReader + buffered_writer = io.BufferedWriter else: - # rb/rt are equivalent in Py2 - if 'r' in mode: - try: - return PipedGzipReader(filename) - except OSError: - # gzip not installed - return buffered_reader(gzip.open(filename, mode)) - else: - try: - return PipedGzipWriter(filename, mode) - except OSError: - return buffered_writer(gzip.open(filename, mode)) + buffered_reader = lambda x: x + buffered_writer = lambda x: x + if 'r' in mode: + try: + return PipedGzipReader(filename, mode) + except OSError: + # gzip not installed + return buffered_reader(gzip.open(filename, mode)) + else: + try: + return PipedGzipWriter(filename, mode) + except OSError: + return buffered_writer(gzip.open(filename, mode, compresslevel=compresslevel)) else: + # Python 2.6 and 2.7 have io.open, which we could use to make the returned + # object consistent with the one returned in Python 3, but reading a file + # with io.open() is 100 times slower (!) on Python 2.6, and still about + # three times slower on Python 2.7 (tested with "for _ in io.open(path): pass") return open(filename, mode) -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-xopen.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
