On Wed, Apr 03, 2013 at 04:27:43PM +0200, Helmut Grohne wrote: > So maybe we can work on a solution here? I'll try to patch arpy to > support this use case. Give me a week?
Turned out to be easier than expected. Maybe less arguing and more fixing would have helped here. The attached patch makes reading from stdin possible. It has one major drawback at the moment: even when your backing fileobj does support seeking, it still emulates forward seeks using reads, because there is no way to reliably detect the availability of seek. So accessing random members of an Archive could result in very bad performance (especially if combined with bz2). In order to fix that, the user has to provide some indication of whether seeking is desired. When using Archive without seek, only the current ArchiveFileData may be read, once, from start to end. To ease working with the Archive class, I turned it into an iterator yielding ArchiveFileData objects. When using this iterator, forward seeks are almost never necessary. You can omit the iterator part, but I still think it is useful. Helmut
diff -r 67ef59afde76 arpy.py --- a/arpy.py Sun Mar 24 01:52:07 2013 +0000 +++ b/arpy.py Wed Apr 03 17:40:11 2013 +0200 @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # # Copyright 2011 StanisÅaw Pitucha. All rights reserved. +# Copyright 2013 Helmut Grohne. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are # permitted provided that the following conditions are met: @@ -127,12 +128,12 @@ class ArchiveFileData(object): """ File-like object used for reading an archived file """ - def __init__(self, file_obj, header): + def __init__(self, ar_obj, header): """ Creates a new proxy for the archived file, reusing the archive's file descriptor """ self.header = header - self.file = file_obj + self.arobj = ar_obj self.last_offset = 0 def read(self, size = None): @@ -143,8 +144,8 @@ if self.header.size < self.last_offset + size: size = self.header.size - self.last_offset - self.file.seek(self.header.file_offset + self.last_offset) - data = self.file.read(size) + self.arobj._seek(self.header.file_offset + self.last_offset) + data = self.arobj._read(size) if len(data) < size: raise ArchiveAccessError("incorrect archive file") @@ -175,18 +176,38 @@ def __init__(self, filename=None, fileobj=None): self.headers = [] self.file = fileobj or open(filename, "rb") - if self.file.read(GLOBAL_HEADER_LEN) != b"!<arch>\n": + self.position = 0 + if self._read(GLOBAL_HEADER_LEN) != b"!<arch>\n": raise ArchiveFormatError("file is missing the global header") self.next_header_offset = GLOBAL_HEADER_LEN self.gnu_table = None self.archived_files = {} + def _read(self, length): + data = self.file.read(length) + self.position += len(data) + return data + + def _seek(self, offset): + if offset < 0: + raise ArchiveAccessError("incorrect file position") + if offset < self.position: + # seek required, might fail + self.file.seek(offset) + self.position = self.file.tell() + else: + # emulate seek + while self.position < offset: + if not 
self._read(min(4096, offset - self.position)): + # reached EOF before target offset + return + def __read_file_header(self, offset): """ Reads and returns a single new file header """ - self.file.seek(offset) + self._seek(offset) - header = self.file.read(HEADER_LEN) + header = self._read(HEADER_LEN) if len(header) == 0: return None @@ -208,7 +229,7 @@ def __read_gnu_table(self, size): """ Reads the table of filenames specific to GNU ar format """ - table_string = self.file.read(size) + table_string = self._read(size) if len(table_string) != size: raise ArchiveFormatError("file too short to fit the names table") @@ -234,8 +255,8 @@ # BSD format includes the filename in the file size header.size -= filename_len - self.file.seek(header.offset + HEADER_LEN) - header.name = self.file.read(filename_len) + self._seek(header.offset + HEADER_LEN) + header.name = self._read(filename_len) return filename_len elif header.type == HEADER_GNU_TABLE: @@ -274,10 +295,22 @@ if header is not None: self.headers.append(header) if header.type in (HEADER_BSD, HEADER_NORMAL, HEADER_GNU): - self.archived_files[header.name] = ArchiveFileData(self.file, header) + self.archived_files[header.name] = ArchiveFileData(self, header) return header + def __next__(self): + while True: + header = self.read_next_header() + if header is None: + raise StopIteration + if header.type in (HEADER_BSD, HEADER_NORMAL, HEADER_GNU): + return self.archived_files[header.name] + next = __next__ + + def __iter__(self): + return self + def read_all_headers(self): """ Reads all headers """ while self.read_next_header() is not None: