Very nice. I am wondering how far this can be taken. TIFF's LZW uses an end-of-information code after the clear code, allows both little-endian and big-endian packing of bytes, and increases the code size one step too early. Is it even possible to generalize LZW (any further), and would it be a good idea, given how uniquely it's implemented in each file format?
Damjan On Sun, Dec 15, 2013 at 5:23 PM, <bode...@apache.org> wrote: > Author: bodewig > Date: Sun Dec 15 15:23:35 2013 > New Revision: 1551026 > > URL: http://svn.apache.org/r1551026 > Log: > reduce code duplication in the two LZW input streams > > Added: > > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java > (with props) > Modified: > > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnshrinkingInputStream.java > > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/ZCompressorInputStream.java > > Modified: > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnshrinkingInputStream.java > URL: > http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnshrinkingInputStream.java?rev=1551026&r1=1551025&r2=1551026&view=diff > ============================================================================== > --- > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnshrinkingInputStream.java > (original) > +++ > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/archivers/zip/UnshrinkingInputStream.java > Sun Dec 15 15:23:35 2013 > @@ -21,84 +21,43 @@ package org.apache.commons.compress.arch > import java.io.IOException; > import java.io.InputStream; > > -import org.apache.commons.compress.compressors.CompressorInputStream; > +import org.apache.commons.compress.compressors.z.AbstractLZWInputStream; > > /** > * Input stream that decompresses ZIP method 1 (unshrinking). A variation of > the LZW algorithm, with some twists. > * @NotThreadSafe > * @since 1.7 > */ > -class UnshrinkingInputStream extends CompressorInputStream { > - private final InputStream in; > - private final int clearCode; > - private final int MAX_CODE_SIZE = 13; > - private int codeSize = 9; > - private int bitsCached = 0; > - private int bitsCachedSize = 0; > - private int previousCode = -1; > - private int tableSize = 0; > - private final int[] prefixes; > - private final byte[] characters; > +class UnshrinkingInputStream extends AbstractLZWInputStream { > + private static final int MAX_CODE_SIZE = 13; > + private static final int MAX_TABLE_SIZE = 1 << MAX_CODE_SIZE; > private final boolean[] isUsed; > - private final byte[] outputStack; > - private int outputStackLocation; > > public UnshrinkingInputStream(InputStream inputStream) throws > IOException { > - this.in = inputStream; > - clearCode = (1 << (codeSize - 1)); > - final int maxTableSize = 1 << MAX_CODE_SIZE; > - prefixes = new int[maxTableSize]; > - characters = new byte[maxTableSize]; > - isUsed = new boolean[maxTableSize]; > - outputStack = new byte[maxTableSize]; > - outputStackLocation = maxTableSize; > + super(inputStream); > + setClearCode(codeSize); > + initializeTables(MAX_CODE_SIZE); > + isUsed = new boolean[prefixes.length]; > for (int i = 0; i < (1 << 8); i++) { > - prefixes[i] = -1; > - characters[i] = (byte)i; > isUsed[i] = true; > } > tableSize = clearCode + 1; > } > - > - public void close() throws IOException { > - in.close(); > - } > - > - private int readNextCode() throws IOException { > - while (bitsCachedSize < codeSize) { > - final int nextByte = in.read(); > - if (nextByte < 0) { > - return nextByte; > - } > - bitsCached |= (nextByte << bitsCachedSize); > - bitsCachedSize += 8; > - } > - final int mask = (1 << codeSize) - 1; > - final int code = (bitsCached & mask); > - bitsCached >>>= codeSize; > - bitsCachedSize -= codeSize; > - return code; > - } > - > - private int addEntry(int previousCode, byte character) throws > IOException { > - final int maxTableSize = 1 << MAX_CODE_SIZE; > - while ((tableSize < maxTableSize) && isUsed[tableSize]) { > + > + @Override > + protected int addEntry(int previousCode, byte character) throws > IOException { > + while ((tableSize < MAX_TABLE_SIZE) && isUsed[tableSize]) { > tableSize++; > } > - if (tableSize < maxTableSize) { > - final int index = tableSize; > - prefixes[tableSize] = previousCode; > - characters[tableSize] = character; > - isUsed[tableSize] = true; > - tableSize++; > - return index; > - } else { > - return -1; > + int idx = addEntry(previousCode, character, MAX_TABLE_SIZE); > + if (idx >= 0) { > + isUsed[idx] = true; > } > + return idx; > } > > private void partialClear() throws IOException { > - final boolean[] isParent = new boolean[1 << MAX_CODE_SIZE]; > + final boolean[] isParent = new boolean[MAX_TABLE_SIZE]; > for (int i = 0; i < isUsed.length; i++) { > if (isUsed[i] && prefixes[i] != -1) { > isParent[prefixes[i]] = true; > @@ -112,7 +71,8 @@ class UnshrinkingInputStream extends Com > } > } > > - private int decompressNextSymbol() throws IOException { > + @Override > + protected int decompressNextSymbol() throws IOException { > // > // table entry table entry > // _____________ _____ > @@ -147,76 +107,12 @@ class UnshrinkingInputStream extends Com > return 0; > } else { > boolean addedUnfinishedEntry = false; > - final int effectiveCode; > - if (isUsed[code]) { > - effectiveCode = code; > - } else { > - // must be a repeat of the previous entry we haven't added > yet > - if (previousCode == -1) { > - // ... which isn't possible for the very first code > - throw new IOException("The first code can't be a > reference to its preceding code"); > - } > - byte firstCharacter = 0; > - for (int last = previousCode; last >= 0; last = > prefixes[last]) { > - firstCharacter = characters[last]; > - } > - effectiveCode = addEntry(previousCode, firstCharacter); > + int effectiveCode = code; > + if (!isUsed[code]) { > + effectiveCode = addRepeatOfPreviousCode(); > addedUnfinishedEntry = true; > } > - for (int entry = effectiveCode; entry >= 0; entry = > prefixes[entry]) { > - outputStack[--outputStackLocation] = characters[entry]; > - } > - if (previousCode != -1 && !addedUnfinishedEntry) { > - addEntry(previousCode, outputStack[outputStackLocation]); > - } > - previousCode = code; > - return outputStackLocation; > - } > - } > - > - public int read() throws IOException { > - byte[] b = new byte[1]; > - int ret; > - while ((ret = read(b)) == 0) { > - } > - if (ret < 0) { > - return ret; > - } > - return 0xff & b[0]; > - } > - > - public int read(byte[] b, int off, int len) throws IOException { > - int bytesRead = 0; > - int remainingInStack = outputStack.length - outputStackLocation; > - if (remainingInStack > 0) { > - int maxLength = Math.min(remainingInStack, len); > - System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > - outputStackLocation += maxLength; > - off += maxLength; > - len -= maxLength; > - bytesRead += maxLength; > - } > - while (len > 0) { > - int result = decompressNextSymbol(); > - if (result < 0) { > - if (bytesRead > 0) { > - count(bytesRead); > - return bytesRead; > - } else { > - return result; > - } > - } > - remainingInStack = outputStack.length - outputStackLocation; > - if (remainingInStack > 0) { > - int maxLength = Math.min(remainingInStack, len); > - System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > - outputStackLocation += maxLength; > - off += maxLength; > - len -= maxLength; > - bytesRead += maxLength; > - } > + return expandCodeToOutputStack(effectiveCode, > addedUnfinishedEntry); > } > - count(bytesRead); > - return bytesRead; > } > } > > Added: > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java > URL: > http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java?rev=1551026&view=auto > ============================================================================== > --- > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java > (added) > +++ > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java > Sun Dec 15 15:23:35 2013 > @@ -0,0 +1,201 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one > + * or more contributor license agreements. See the NOTICE file > + * distributed with this work for additional information > + * regarding copyright ownership. The ASF licenses this file > + * to you under the Apache License, Version 2.0 (the > + * "License"); you may not use this file except in compliance > + * with the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > + * software distributed under the License is distributed on an > + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY > + * KIND, either express or implied. See the License for the > + * specific language governing permissions and limitations > + * under the License. > + */ > +package org.apache.commons.compress.compressors.z; > + > +import java.io.IOException; > +import java.io.InputStream; > + > +import org.apache.commons.compress.compressors.CompressorInputStream; > + > +/** > + * Base-class for traditional Unix ".Z" compression and the > + * Unshrinking method of ZIP archive. > + * @NotThreadSafe > + * @since 1.7 > + */ > +public abstract class AbstractLZWInputStream extends CompressorInputStream { > + private final byte[] oneByte = new byte[1]; > + > + protected final InputStream in; > + protected int clearCode = -1; > + protected int codeSize = 9; > + protected int bitsCached = 0; > + protected int bitsCachedSize = 0; > + protected int previousCode = -1; > + protected int tableSize = 0; > + protected int[] prefixes; > + protected byte[] characters; > + private byte[] outputStack; > + private int outputStackLocation; > + > + public AbstractLZWInputStream(InputStream inputStream) throws > IOException { > + this.in = inputStream; > + } > + > + @Override > + public void close() throws IOException { > + in.close(); > + } > + > + @Override > + public int read() throws IOException { > + int ret; > + while ((ret = read(oneByte)) == 0) { // NOPMD > + } > + if (ret < 0) { > + return ret; > + } > + return 0xff & oneByte[0]; > + } > + > + @Override > + public int read(byte[] b, int off, int len) throws IOException { > + int bytesRead = 0; > + int remainingInStack = outputStack.length - outputStackLocation; > + if (remainingInStack > 0) { > + int maxLength = Math.min(remainingInStack, len); > + System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > + outputStackLocation += maxLength; > + off += maxLength; > + len -= maxLength; > + bytesRead += maxLength; > + } > + while (len > 0) { > + int result = decompressNextSymbol(); > + if (result < 0) { > + if (bytesRead > 0) { > + count(bytesRead); > + return bytesRead; > + } else { > + return result; > + } > + } > + remainingInStack = outputStack.length - outputStackLocation; > + if (remainingInStack > 0) { > + int maxLength = Math.min(remainingInStack, len); > + System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > + outputStackLocation += maxLength; > + off += maxLength; > + len -= maxLength; > + bytesRead += maxLength; > + } > + } > + count(bytesRead); > + return bytesRead; > + } > + > + /** > + * Read the next code and expand it. > + */ > + protected abstract int decompressNextSymbol() throws IOException; > + > + /** > + * Add a new entry to the dictionary. > + */ > + protected abstract int addEntry(int previousCode, byte character) > + throws IOException; > + > + /** > + * Sets the clear code based on the code size. > + */ > + protected void setClearCode(int codeSize) { > + clearCode = (1 << (codeSize - 1)); > + } > + > + /** > + * Initializes the arrays based on the maximum code size. > + */ > + protected void initializeTables(int maxCodeSize) { > + final int maxTableSize = 1 << maxCodeSize; > + prefixes = new int[maxTableSize]; > + characters = new byte[maxTableSize]; > + outputStack = new byte[maxTableSize]; > + outputStackLocation = maxTableSize; > + final int max = 1 << 8; > + for (int i = 0; i < max; i++) { > + prefixes[i] = -1; > + characters[i] = (byte) i; > + } > + } > + > + /** > + * Reads the next code from the stream. > + */ > + protected int readNextCode() throws IOException { > + while (bitsCachedSize < codeSize) { > + final int nextByte = in.read(); > + if (nextByte < 0) { > + return nextByte; > + } > + bitsCached |= (nextByte << bitsCachedSize); > + bitsCachedSize += 8; > + } > + final int mask = (1 << codeSize) - 1; > + final int code = (bitsCached & mask); > + bitsCached >>>= codeSize; > + bitsCachedSize -= codeSize; > + return code; > + } > + > + /** > + * Adds a new entry if the maximum table size hasn't been exceeded > + * and returns the new index. > + */ > + protected int addEntry(int previousCode, byte character, int > maxTableSize) { > + if (tableSize < maxTableSize) { > + final int index = tableSize; > + prefixes[tableSize] = previousCode; > + characters[tableSize] = character; > + tableSize++; > + return index; > + } > + return -1; > + } > + > + /** > + * Add entry for repeat of previousCode we haven't added, yet. > + */ > + protected int addRepeatOfPreviousCode() throws IOException { > + if (previousCode == -1) { > + // can't have a repeat for the very first code > + throw new IOException("The first code can't be a reference to > its preceding code"); > + } > + byte firstCharacter = 0; > + for (int last = previousCode; last >= 0; last = prefixes[last]) { > + firstCharacter = characters[last]; > + } > + return addEntry(previousCode, firstCharacter); > + } > + > + /** > + * Expands the entry with index code to the output stack and may > + * create a new entry > + */ > + protected int expandCodeToOutputStack(int code, boolean > addedUnfinishedEntry) > + throws IOException { > + for (int entry = code; entry >= 0; entry = prefixes[entry]) { > + outputStack[--outputStackLocation] = characters[entry]; > + } > + if (previousCode != -1 && !addedUnfinishedEntry) { > + addEntry(previousCode, outputStack[outputStackLocation]); > + } > + previousCode = code; > + return outputStackLocation; > + } > +} > > Propchange: > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/AbstractLZWInputStream.java > ------------------------------------------------------------------------------ > svn:eol-style = native > > Modified: > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/ZCompressorInputStream.java > URL: > http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/ZCompressorInputStream.java?rev=1551026&r1=1551025&r2=1551026&view=diff > ============================================================================== > --- > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/ZCompressorInputStream.java > (original) > +++ > commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/z/ZCompressorInputStream.java > Sun Dec 15 15:23:35 2013 > @@ -21,35 +21,22 @@ package org.apache.commons.compress.comp > import java.io.IOException; > import java.io.InputStream; > > -import org.apache.commons.compress.compressors.CompressorInputStream; > - > /** > * Input stream that decompresses .Z files. > * @NotThreadSafe > * @since 1.7 > */ > -public class ZCompressorInputStream extends CompressorInputStream { > +public class ZCompressorInputStream extends AbstractLZWInputStream { > private static final int MAGIC_1 = 0x1f; > private static final int MAGIC_2 = 0x9d; > private static final int BLOCK_MODE_MASK = 0x80; > private static final int MAX_CODE_SIZE_MASK = 0x1f; > - private final InputStream in; > private final boolean blockMode; > - private final int clearCode; > private final int maxCodeSize; > - private int codeSize = 9; > - private int bitsCached = 0; > - private int bitsCachedSize = 0; > private long totalCodesRead = 0; > - private int previousCode = -1; > - private int tableSize = 0; > - private final int[] prefixes; > - private final byte[] characters; > - private final byte[] outputStack; > - private int outputStackLocation; > > public ZCompressorInputStream(InputStream inputStream) throws > IOException { > - this.in = inputStream; > + super(inputStream); > int firstByte = in.read(); > int secondByte = in.read(); > int thirdByte = in.read(); > @@ -59,26 +46,12 @@ public class ZCompressorInputStream exte > blockMode = ((thirdByte & BLOCK_MODE_MASK) != 0); > maxCodeSize = thirdByte & MAX_CODE_SIZE_MASK; > if (blockMode) { > - clearCode = (1 << (codeSize - 1)); > - } else { > - clearCode = -1; // unused > - } > - final int maxTableSize = 1 << maxCodeSize; > - prefixes = new int[maxTableSize]; > - characters = new byte[maxTableSize]; > - outputStack = new byte[maxTableSize]; > - outputStackLocation = maxTableSize; > - for (int i = 0; i < (1 << 8); i++) { > - prefixes[i] = -1; > - characters[i] = (byte)i; > + setClearCode(codeSize); > } > + initializeTables(maxCodeSize); > clearEntries(); > } > > - public void close() throws IOException { > - in.close(); > - } > - > private void clearEntries() { > tableSize = (1 << 8); > if (blockMode) { > @@ -86,20 +59,12 @@ public class ZCompressorInputStream exte > } > } > > - private int readNextCode() throws IOException { > - while (bitsCachedSize < codeSize) { > - final int nextByte = in.read(); > - if (nextByte < 0) { > - return nextByte; > - } > - bitsCached |= (nextByte << bitsCachedSize); > - bitsCachedSize += 8; > + @Override > + protected int readNextCode() throws IOException { > + int code = super.readNextCode(); > + if (code >= 0) { > + ++totalCodesRead; > } > - final int mask = (1 << codeSize) - 1; > - final int code = (bitsCached & mask); > - bitsCached >>>= codeSize; > - bitsCachedSize -= codeSize; > - ++totalCodesRead; > return code; > } > > @@ -119,22 +84,19 @@ public class ZCompressorInputStream exte > bitsCachedSize = 0; > } > > - private void addEntry(int previousCode, byte character) throws > IOException { > + @Override > + protected int addEntry(int previousCode, byte character) throws > IOException { > final int maxTableSize = 1 << codeSize; > - if (tableSize < maxTableSize) { > - prefixes[tableSize] = previousCode; > - characters[tableSize] = character; > - tableSize++; > - } > - if (tableSize == maxTableSize) { > - if (codeSize < maxCodeSize) { > - reAlignReading(); > - codeSize++; > - } > + int r = addEntry(previousCode, character, maxTableSize); > + if (tableSize == maxTableSize && codeSize < maxCodeSize) { > + reAlignReading(); > + codeSize++; > } > + return r; > } > > - private int decompressNextSymbol() throws IOException { > + @Override > + protected int decompressNextSymbol() throws IOException { > // > // table entry table entry > // _____________ _____ > @@ -159,74 +121,13 @@ public class ZCompressorInputStream exte > } else { > boolean addedUnfinishedEntry = false; > if (code == tableSize) { > - // must be a repeat of the previous entry we haven't added > yet > - if (previousCode == -1) { > - // ... which isn't possible for the very first code > - throw new IOException("The first code can't be a > reference to its preceding code"); > - } > - byte firstCharacter = 0; > - for (int last = previousCode; last >= 0; last = > prefixes[last]) { > - firstCharacter = characters[last]; > - } > - addEntry(previousCode, firstCharacter); > + addRepeatOfPreviousCode(); > addedUnfinishedEntry = true; > } else if (code > tableSize) { > throw new IOException(String.format("Invalid %d bit code > 0x%x", codeSize, code)); > } > - for (int entry = code; entry >= 0; entry = prefixes[entry]) { > - outputStack[--outputStackLocation] = characters[entry]; > - } > - if (previousCode != -1 && !addedUnfinishedEntry) { > - addEntry(previousCode, outputStack[outputStackLocation]); > - } > - previousCode = code; > - return outputStackLocation; > - } > - } > - > - public int read() throws IOException { > - byte[] b = new byte[1]; > - int ret; > - while ((ret = read(b)) == 0) { > + return expandCodeToOutputStack(code, addedUnfinishedEntry); > } > - if (ret < 0) { > - return ret; > - } > - return 0xff & b[0]; > } > > - public int read(byte[] b, int off, int len) throws IOException { > - int bytesRead = 0; > - int remainingInStack = outputStack.length - outputStackLocation; > - if (remainingInStack > 0) { > - int maxLength = Math.min(remainingInStack, len); > - System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > - outputStackLocation += maxLength; > - off += maxLength; > - len -= maxLength; > - bytesRead += maxLength; > - } > - while (len > 0) { > - int result = decompressNextSymbol(); > - if (result < 0) { > - if (bytesRead > 0) { > - count(bytesRead); > - return bytesRead; > - } else { > - return result; > - } > - } > - remainingInStack = outputStack.length - outputStackLocation; > - if (remainingInStack > 0) { > - int maxLength = Math.min(remainingInStack, len); > - System.arraycopy(outputStack, outputStackLocation, b, off, > maxLength); > - outputStackLocation += maxLength; > - off += maxLength; > - len -= maxLength; > - bytesRead += maxLength; > - } > - } > - count(bytesRead); > - return bytesRead; > - } > } > > --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@commons.apache.org For additional commands, e-mail: dev-h...@commons.apache.org