patch: gzip compress 26% to 43% faster at 6<=level

John Reiser Tue, 02 Jul 2013 08:52:32 -0700

Using my previous patch set
   http://lists.gnu.org/archive/html/bug-gzip/2013-06/msg00027.html
I tried compressing using direct indexing and separate chaining of
3-strings: HASH_BITS=24, H_SHIFT=8.  This was too slow because
the typical L2 cache of 256KiB or 512KiB suffered from many misses,
including "sweeping".  However, separate chaining did decrease
greatly the number of positions that were examined by longest_match().
This is the leading term of overall run time for compressing.


Thus I changed to direct indexing by 2-strings (pairs) as a stem, and
for each stem value the set of 3rd characters is stored consecutively
as gathered leaves.  When processing a new window, then for each stem
the old leaves are scattered by value into an array, the new 3-chains
for that stem are added, and the new leaves are gathered back into con-
secutive storage.  This gives exact chaining of 3-strings but with nice
cache usage.  The memory cost is an additional 1/2 MiB or so, plus the
requirement for fast indexing of an array of 65,536 'short' elements.
(Sorry, MAXSEG_64K.)

That code often is 30% to 45% faster than previous for typical "sweet spot"
cases: approx. 0.37 < output/input < 0.55.  If the input already is compressed
then tracking the 3rd-character leaf values takes time but gives little
advantage: the 3-chains are all too short.  On the other side, for
many highly compressible inputs even the chains of 3-strings are long.
Chains of 4-strings look desirable, but this is left for future work.
For now, the scan of a new window computes a histogram of the pairs,
and reverts to chaining just the 2-strings (pairs) if all chains are short.
Computing the histogram costs a few percent, but prevents a bad slowdown
when a window already is compressed.

At compression level 6 or 7 often the output is around 1/2000 (0.05%)
larger than the previous result.  At level 8 or 9 the new result often
is very slightly smaller.  Compression level 8 (not 9) often is pleasingly
fast, with output that is essentially the same size as previously.
These results suggest additional tuning of the effective internal
parameters derived from configuration_table[].

Sliding the accounting data can become expensive.  If the compiler
generates a conditional branch for
   (Pos)((m > slide) ? (m - slide) : 0)
then that branch often is unpredictable and can take 15 or more cycles for
many elements.  An equivalent evaluation using sign-extending right shift:
   int d = m - slide;
   (Pos)(d &~ (d >> (2==sizeof(int)?15:4==sizeof(int)?31:63)))
takes a few cycles and avoids conditional branch entirely.
Many current processors can perform "saturating subtract"
of 2, 4, 8, or 16 SIMD elements in one cycle, but the details
are machine dependent.  For example:
   
https://github.com/kaffeemonster/zlib/commit/aa30d206d93755c1f5c287b78dd7d165fb0998a8
Unrolling and pipelining SIMD in assembler can achieve optimality:
each available cache cycle is used and is full-width productive.
The machine-dependent benefit is *several percent* faster compressing,
end-to-end.  Maintainers should step up to handle such a dependency.

Exact 3-chains are expensive for low-effort compression.  (level <= 5)
should use the previous hashing technique.  The code would chose at run time
among several versions of longest_match(): one for level <= 5, one when
all chains in the window are short, one for the anticipated "sweet spot" when
3-chains greatly shorten search, one when too many 3-chains are long.

--

diff --git a/Makefile.am b/Makefile.am
index a3fade7..97176d9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -51,7 +51,7 @@ bin_PROGRAMS = gzip
 bin_SCRIPTS = gunzip gzexe zcat zcmp zdiff \
   zegrep zfgrep zforce zgrep zless zmore znew
 gzip_SOURCES = \
-  bits.c deflate.c gzip.c inflate.c lzw.c \
+  bits.c chain.c deflate.c gzip.c inflate.c lzw.c \
   trees.c unlzh.c unlzw.c unpack.c unzip.c util.c zip.c
 gzip_LDADD = libver.a lib/libgzip.a
 gzip_LDADD += $(LIB_CLOCK_GETTIME)
diff --git a/chain.c b/chain.c
new file mode 100644
index 0000000..790aa0d
--- /dev/null
+++ b/chain.c
@@ -0,0 +1,322 @@
+/* chain.c -- link positions by 2- or 3-byte values
+
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   Copyright (C) 2013 BitWagon Software LLC.  Written by John Reiser.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#include <config.h>
+#include "tailor.h"
+#include "gzip.h"
+#include <stdlib.h>  /* MY_DEBUG abort */
+#include "deflate.h"
+
+/* Saturating subtract without "multimedia instructions",
+ * but still avoiding unpredictable conditional branch (which is slow.)
+ * Architecture-dependent SIMD would save another *several percent* overall
+ * but maintainers are unwilling.
+ * x86: psubusw;  ARM: vqsub.u16;  PowerPC: vsubuhs;  etc.
+ */
+static void slide_Pos(Pos *dst, Pos const *src, unsigned slide, unsigned n)
+{
+    if (n) do {
+#if (-1)==((-1)>>1)  /* two's complement with right shift that sign extends */
+        int const d = *src++ - slide;
+        /* 0 <= d:     d &~  0  ==>  d.
+         * d <  0:     d &~ ~0  ==>  0.
+         */
+        *dst++ = (Pos)(d &~ (d >> (2==sizeof(int)?15:4==sizeof(int)?31:63)));
+#else  /* unpredictable branch is slow unless conditional move */
+        unsigned m = *src++;
+        *dst++ = (Pos)((m >= slide) ? (m - slide) : 0);
+#endif
+    } while (--n);
+}
+
+#define N_ctxlen (1<<(2*8))  /* pairs of bytes */
+local uch ctxlen[N_ctxlen];
+
+/* Accounting for 3-chains: */
+local Pos x3hi[N_ctxlen];  /* hi index in pos3, chr3 */
+local Pos pos3[2*WSIZE];  /* gathered Pos ends of 3-chains; two generations */
+local uch chr3[2*WSIZE];  /* gathered last uch of 3-chains; two generations */
+Pos work[1<<8];           /* [uch] Pos end of 3-chain */
+
+local Pos *oldp3= &pos3[0], *newp3= &pos3[WSIZE];
+local uch *oldc3= &chr3[0], *newc3= &chr3[WSIZE];
+
+#define MY_DEBUG 0
+
+local void chain3_part3(
+    uch const *const bow,  /* lo end including look-back */
+    uch const *const bow2, /* lo end of new data  (used only by MY_DEBUG) */
+    uch const *const eow,  /* hi end of new data  (used only by MY_DEBUG) */
+    unsigned const rim
+)
+{
+    unsigned old_x3lo= 0;
+    unsigned x3new= 0;
+    unsigned j;
+    for (j= 0; j <= (-1+ N_ctxlen); ++j) {
+        unsigned const old_x3hi = x3hi[j];
+        unsigned       pos  =     head[j];
+        if (MY_DEBUG) if (old_x3hi < old_x3lo) abort();
+        if (pos) { /* non-empty chain;  In: work[] is zero */
+                                  head[j] = 0;  /* head[] is zero at return */
+            unsigned const n_old = old_x3hi - old_x3lo;
+            unsigned const new_x3lo= x3new;
+            if (1==rim && n_old) { /* Propagate the old chains. */
+                memcpy(&newp3[x3new], &oldp3[old_x3lo], sizeof(Pos)*n_old);
+                memcpy(&newc3[x3new], &oldc3[old_x3lo], sizeof(uch)*n_old);
+                x3new += n_old;
+            }
+            unsigned k;
+            /* Scatter the ending Pos of old chains. */
+            for (k= old_x3lo; k < old_x3hi; ++k) {
+                work[oldc3[k]] = oldp3[k];
+            }
+            /* Append new chains. */
+            do {
+                unsigned const c = bow[-1+ MIN_MATCH + pos];
+                newc3[x3new] = c;  /* speculative */
+                      x3new += (work[c] < rim);  /* avoid branch */
+                unsigned const
+                next = prev[pos];
+                       prev[pos] = work[c];
+                                   work[c] = pos;
+                                             pos = next;
+            } while (pos);
+            /* Gather the ending Pos of new chains. */
+            /* Clear work[] of new chains. */
+            for (k= new_x3lo; k < x3new; ++k) {
+                /* gcc thinks that newp3[] and work[] might alias? */
+                pos = work[newc3[k]];
+                      work[newc3[k]] = 0;
+                if (MY_DEBUG) {
+                    unsigned prv;
+                    for (prv = prev[pos]; prv; prv = prev[prv])
+                        if (memcmp(&bow[prv], &bow[pos], MIN_MATCH)) abort();
+                }
+                if (MY_DEBUG) if (new_x3lo!=k && newp3[-1+ k]==pos) abort();
+                newp3[k]= pos;
+            }
+            /* Clear work[] of old chains. */
+            for (k= old_x3lo; k < old_x3hi; ++k) {
+                work[oldc3[k]] = 0;
+            }
+            /* work[] now is all-zero (again). */
+            if (MY_DEBUG) for (k=0; k < (1<<8); ++k) if (work[k]) abort();
+        }
+        old_x3lo = old_x3hi;
+        if (MY_DEBUG) if (j && x3new < x3hi[-1+ j]) abort();
+
+        x3hi[j] = x3new;
+    }
+    if (MY_DEBUG) if ((eow - bow2) < x3new) abort();
+    if (MY_DEBUG) {
+        unsigned j;
+        for (j=1; j <= (-1+  N_ctxlen); ++j) {
+            if (x3hi[j] < x3hi[-1+ j]) abort();
+            if (head[j]) abort();
+        }
+    }
+    
+#define SWAP(a, b) {typeof(a) const t= a; a= b; b= t;}
+    SWAP(oldp3, newp3);
+    SWAP(oldc3, newc3);
+}
+
+/* Client longest_match() uses only prev[] which has descending chain
+ * of Pos that match.
+ * If CHAIN_2==chain_style then match is exact 2 bytes.
+ * If CHAIN_3==chain_style then match is exact 3 bytes.
+ * If CHAIN_4==chain_style then match is exact 3 bytes, and there are many
+ * long chains.  We might wish to chain by exact match of more than 3 bytes,
+ * but that seems to be too expensive.  Also longest_match wants to know
+ * about 3-byte matches anyway.
+ *
+ * Internally, if CHAIN_2==chain_style then head[] has Pos of most recent match
+ * (2 bytes).  If CHAIN_3==chain_style then head[] is zero, and the most recent
+ * match is kept in pos3[] and chr3[], which are the gathered ends of 3-matches.
+ * x3hi[] is the index by pairs.  Thus x3hi corresponds to the 2-byte stem,
+ * with pos3 and chr3 being the 1-byte leaves of a 3-match.  Direct indexing by
+ * 3 bytes (table length is 1<<(3*8) or 16Mi elements) has poor performance
+ * because it is much larger than typical L2 caches of 256KiB or 512KiB.
+ */
+
+unsigned chain_style = CHAIN_3;  /* default style of chaining */
+
+int do_chain(
+    uch const *const bow,  /* lo end including look-back */
+    uch const *const bow2, /* lo end of new data */
+    uch const *const eow,  /* hi end of new data */
+    IPos rim,  /* threshold for new chains */
+    unsigned slide  /* how much to slide prev[] */
+)
+{
+    if (eow==bow2) return 0;  /* new data is empty */
+
+    /* XXX do_chain of more than WSIZE is very messy. */
+    /* Handle <= WSIZE at a time; 1st call can be twice as big. */
+    if (WSIZE < (eow - bow2)) {
+               do_chain(bow, bow, &bow[WSIZE],      rim,   slide);
+        return do_chain(bow,      &bow[WSIZE], eow, WSIZE, slide);
+    }
+
+    unsigned n_wrap = 0;
+    memset(ctxlen, 0, sizeof(ctxlen));  /* clear histogram of pairs */
+    if (slide) {
+        slide_Pos(&prev[0], &prev[slide], slide, slide);
+    }
+
+#define HIST_CUT 0x40
+/* While scanning new data, build histogram of pairs in order to characterize */
+/* distribution of pairs.  Average count is 1/2; reset overflows to HIST_CUT */
+/* in order to reduce frequency of reset but distinguish actual small counts. */
+
+/* Assume that style continues from previous window. */
+    unsigned const old_style = chain_style;
+    switch (chain_style) {
+    case CHAIN_2: {
+        if (slide) {
+            slide_Pos(&head[0], &head[0], slide, LEN_head);
+        }
+        uch const *const hilim= &eow[1- MIN_MATCH];
+        uch const *const lolim= (bow <= &bow2[1- MIN_MATCH])
+            ? &bow2[1- MIN_MATCH]
+            : &bow2[1];  /* prevent 0==pos because that is NIL */
+        if (1!=rim)  /* (1- MIN_MATCH) is an internal detail */
+            rim = lolim - bow;
+        uch const *ptr;
+        for (ptr= lolim; ptr < hilim; ++ptr) { /* ascending scan */
+            unsigned const p0 = GET2(ptr);
+            prev[ptr - bow] = head[p0];
+                              head[p0] = ptr - bow;
+            if (0== ++ctxlen[p0]) {
+                ctxlen[p0] = HIST_CUT;
+                ++n_wrap;
+            }
+        }
+    } break;
+    case CHAIN_4:
+    case CHAIN_3: { /* In: head[] is zero */
+        uch const *const lowlim= (bow <= &bow2[1- MIN_MATCH])
+            ? &bow2[1- MIN_MATCH]
+            : &bow2[1];  /* prevent 0==pos because that is NIL */
+        if (1!=rim)  /* (1- MIN_MATCH) is an internal detail */
+            rim = lowlim - bow;
+        uch const *ptr= &eow[1- MIN_MATCH];
+        while (lowlim < ptr) { /* descending scan */
+            --ptr;
+            unsigned const p0 = GET2(ptr);
+            prev[ptr - bow] = head[p0];
+                              head[p0] = ptr - bow;
+            if (0== ++ctxlen[p0]) {
+                ctxlen[p0] = HIST_CUT;
+                ++n_wrap;
+            }
+        }
+    } break;
+    }
+
+/* Analyze distribution of pairs in new data. */
+    { /* In: work[] is zero */
+        unsigned j;
+        for (j= 0; j<= (-1+ N_ctxlen); ++j)
+            ++work[ctxlen[j]];
+
+        memset(work, 0, 16 * sizeof(work[0]));  /* ignore short chains */
+        unsigned tot = 0;
+        for (j= 16; j<(1<<8); j+=4) {
+            tot += work[0+ j]; work[0+ j] = 0;
+            tot += work[1+ j]; work[1+ j] = 0;
+            tot += work[2+ j]; work[2+ j] = 0;
+            tot += work[3+ j]; work[3+ j] = 0;
+        }
+
+        chain_style = CHAIN_3;  /* assume usual compressability */
+        if (0==tot) { /* All chains are < 16; previously compressed? */
+            chain_style = CHAIN_2;  /* Omit the work for CHAIN_3. */
+        }
+        else if (352 <= (tot + n_wrap)) { /* XXX: value 352 is heuristic. */
+            /* Many long chains; longest match is tedious to find. */
+            /* Chains might remain long even after work here for CHAIN_3. */
+            /* XXX Try CHAIN_2 ?  Scanning might be faster than tracking!! */
+            chain_style = CHAIN_4;
+        }
+    } /* Out: work[] is zero */
+
+/* Fix conflict in style, if any. */
+    if (old_style!=chain_style) switch (chain_style) {
+    case CHAIN_2: { /* old was CHAIN_3; new is CHAIN_2 */
+        memset(head, 0, LEN_head*sizeof(head[0]));  /* Empty all chains. */
+
+        /* Scan entire window ascending. */
+        prev[0] = NIL;  /* paranoia */
+        uch const *const hilim= &eow[1- MIN_MATCH];
+        uch const *ptr= &bow[1];
+        for (; ptr < hilim; ++ptr) {
+            unsigned const p0 = GET2(ptr);
+            prev[ptr - bow] = head[p0];
+                              head[p0] = ptr - bow;
+        }
+    } break;
+    case CHAIN_4:
+    case CHAIN_3: if (CHAIN_2==old_style) {
+        memset(head, 0, LEN_head*sizeof(head[0]));  /* Empty all chains. */
+
+        /* Scan old window descending. */
+        uch const *lowlim= &bow[1];
+        uch const *ptr= &bow2[1- MIN_MATCH];
+        while (lowlim < ptr) { /* descending scan */
+            --ptr;
+            unsigned const p0 = GET2(ptr);
+            prev[ptr - bow] = head[p0];
+                              head[p0] = ptr - bow;
+        }
+        /* Chain the old window with 1==rim */
+        memset(x3hi, 0, sizeof(x3hi));  /* Empty all chains. */
+        chain3_part3(bow, bow, bow2, 1);
+
+        /* Scan new window descending. */
+        lowlim = &bow2[1- MIN_MATCH];
+        ptr = &eow[1- MIN_MATCH];
+        while (lowlim < ptr) { /* descending scan */
+            --ptr;
+            unsigned const p0 = GET2(ptr);
+            prev[ptr - bow] = head[p0];
+                              head[p0] = ptr - bow;
+        }
+        slide = 0;
+    } break;
+    } /* end switch() */
+
+/* Finish chaining the window according to style of new data. */
+    switch (chain_style) {
+    case CHAIN_2: {
+        /* Nothing remains to be done. */
+    } break;
+    case CHAIN_4:
+    case CHAIN_3: {
+        if (slide) {
+            slide_Pos(&oldp3[0], &oldp3[0], slide, x3hi[-1+ N_ctxlen]);
+        }
+        chain3_part3(bow, bow2, eow, rim);
+    } break;
+    }
+
+    return 0;
+}
+
diff --git a/deflate.c b/deflate.c
index 7fb6ed2..6e0024a 100644
--- a/deflate.c
+++ b/deflate.c
@@ -29,39 +29,14 @@
  *      of the input text which are identical to earlier input (within a
  *      sliding window trailing behind the input currently being processed).
  *
- *      The most straightforward technique turns out to be the fastest for
- *      most input files: try all possible matches and select the longest.
- *      The key feature of this algorithm is that insertions into the string
- *      dictionary are very simple and thus fast, and deletions are avoided
- *      completely. Insertions are performed at each input character, whereas
- *      string matches are performed only when the previous match ends. So it
- *      is preferable to spend more time in matches to allow very fast string
- *      insertions and avoid deletions. The matching algorithm for small
- *      strings is inspired from that of Rabin & Karp. A brute force approach
- *      is used to find longer strings when a small match has been found.
- *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
- *      (by Leonid Broukhis).
- *         A previous version of this file used a more sophisticated algorithm
- *      (by Fiala and Greene) which is guaranteed to run in linear amortized
- *      time, but has a larger average cost, uses more memory and is patented.
- *      However the F&G algorithm may be faster for some highly redundant
- *      files if the parameter max_chain_length (described below) is too large.
- *
- *  ACKNOWLEDGEMENTS
- *
- *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
- *      I found it in 'freeze' written by Leonid Broukhis.
- *      Thanks to many info-zippers for bug reports and testing.
- *
- *  REFERENCES
- *
- *      APPNOTE.TXT documentation file in PKZIP 1.93a distribution.
- *
- *      A description of the Rabin and Karp algorithm is given in the book
- *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
- *
- *      Fiala,E.R., and Greene,D.H.
- *         Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595
+ *      The classic gzip code used clever "no-delete" hashing to operate under
+ *      the constraint of MAXSEG_64K.  That code is close to optimal for cases
+ *      where fastest execution is desired, and should be re-instated for
+ *      "low-effort" compression: level <= 5.  The current code here uses more
+ *      and larger arrays (short[65,536]) and exact chaining (no collisions)
+ *      in an attempt to speed compression at "standard effort" (default
+ *      6==level) and "extra effort" (7<=level) while retaining a compression
+ *      ratio nearly the same as the old classic gzip code.
  *
  *  INTERFACE
  *
@@ -86,13 +61,6 @@
  */
 
 #include "deflate.h"
-#define HASH_SIZE (unsigned)(1<<HASH_BITS)
-#define HASH_MASK (HASH_SIZE-1)
-#define WMASK     (WSIZE-1)
-/* HASH_SIZE and WSIZE must be powers of two */
-
-#define NIL 0
-/* Tail of hash chains */
 
 #define FAST 4
 #define SLOW 2
@@ -107,12 +75,6 @@
  * Local data used by the "longest match" routines.
  */
 
-typedef ush Pos;
-typedef unsigned IPos;
-/* A Pos is an index in the character window. We use short instead of int to
- * save space in the various tables. IPos is used only for parameter passing.
- */
-
 /* DECLARE(uch, window, 2L*WSIZE); */
 /* Sliding window. Input bytes are read into the second half of the window,
  * and move to the first half later to keep a dictionary of at least WSIZE
@@ -124,13 +86,13 @@ typedef unsigned IPos;
  * be less efficient).
  */
 
-/* DECLARE(Pos, prev, WSIZE); */
+/* DECLARE(Pos, prev, LEN_prev); */
 /* Link to older string with same hash index. To limit the size of this
  * array to 64K, this link is maintained only for the last 32K strings.
  * An index in this array is thus a window index modulo 32K.
  */
 
-/* DECLARE(Pos, head, 1<<HASH_BITS); */
+/* DECLARE(Pos, head, LEN_head); */
 /* Heads of the hash chains or NIL. */
 
 static ulg window_size = (ulg)2*WSIZE;
@@ -145,13 +107,6 @@ long block_start;
 
 local unsigned ins_h;  /* hash index of string to be inserted */
 
-#define H_SHIFT  ((HASH_BITS+MIN_MATCH-1)/MIN_MATCH)
-/* Number of bits by which ins_h and del_h must be shifted at each
- * input step. It must be such that after MIN_MATCH steps, the oldest
- * byte no longer takes part in the hash key, that is:
- *   H_SHIFT * MIN_MATCH >= HASH_BITS
- */
-
        unsigned int near prev_length;
 /* Length of the best match at previous step. Matches not greater than this
  * are discarded. This is used in the lazy match evaluation.
@@ -251,7 +206,7 @@ local  void check_match (IPos start, IPos match, int length);
  *    input characters, so that a running hash key can be computed from the
  *    previous key instead of complete recalculation each time.
  */
-#define UPDATE_HASH(h,c) (h = (((h)<<H_SHIFT) ^ (c)) & HASH_MASK)
+#define UPDATE_HASH(h, s) /*(h = GET2(&window[s]))*/
 
 /* ===========================================================================
  * Insert string s in the dictionary and set match_head to the previous head
@@ -261,10 +216,8 @@ local  void check_match (IPos start, IPos match, int length);
  *    input characters and the first MIN_MATCH bytes of s are valid
  *    (except for the last MIN_MATCH-1 bytes of the input file).
  */
-#define INSERT_STRING(s, match_head) \
-   (UPDATE_HASH(ins_h, window[(s) + MIN_MATCH-1]), \
-    prev[(s) & WMASK] = match_head = head[ins_h], \
-    head[ins_h] = (s))
+#define INSERT_STRING(s, match_head) match_head = prev[s]
+/*   (UPDATE_HASH(ins_h, s), prev[s] = match_head = head[ins_h], head[ins_h] = (s)) */
 
 /* ===========================================================================
  * Initialize the "longest match" routines for a new file
@@ -273,16 +226,18 @@ void lm_init (pack_level, flags)
     int pack_level; /* 0: store, 1: best speed, 9: best compression */
     ush *flags;     /* general purpose bit flag */
 {
-    register unsigned j;
 
     if (pack_level < 1 || pack_level > 9) gzip_error ("bad pack level");
     compr_level = pack_level;
 
     /* Initialize the hash table. */
 #if defined(MAXSEG_64K) && HASH_BITS == 15
-    for (j = 0;  j < HASH_SIZE; j++) head[j] = NIL;
+    {
+        register unsigned j;
+        for (j = 0;  j < LEN_head; j++) head[j] = NIL;
+    }
 #else
-    memzero((char*)head, HASH_SIZE*sizeof(*head));
+    memzero((char*)head, LEN_head*sizeof(*head));
 #endif
     /* prev will be initialized on the fly */
 
@@ -306,25 +261,8 @@ void lm_init (pack_level, flags)
 #ifdef ASMV
     match_init(); /* initialize the asm code */
 #endif
-
-    lookahead = read_buf((char*)window,
-                         sizeof(int) <= 2 ? (unsigned)WSIZE : 2*WSIZE);
-
-    if (lookahead == 0 || lookahead == (unsigned)EOF) {
-       eofile = 1, lookahead = 0;
-       return;
-    }
     eofile = 0;
-    /* Make sure that we always have enough lookahead. This is important
-     * if input comes from a device such as a tty.
-     */
-    while (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
-
-    ins_h = 0;
-    for (j=0; j<MIN_MATCH-1; j++) UPDATE_HASH(ins_h, window[j]);
-    /* If lookahead < MIN_MATCH, ins_h is garbage, but this is
-     * not important since only literal bytes will be emitted.
-     */
+    fill_window();
 }
 
 /* ===========================================================================
@@ -377,6 +315,12 @@ longest_match(IPos cur_match)
     if (prev_length >= good_match) {
         chain_length >>= 2;
     }
+    if (CHAIN_3 <= chain_style) {
+        if (CHAIN_4==chain_style) {
+            chain_length += chain_length >> 1;  /* * 1.5 */
+        }
+        chain_length = (1& chain_length) | (chain_length >> 1);  /* * 0.5 */
+    }
     Assert(strstart <= window_size-MIN_LOOKAHEAD, "insufficient lookahead");
 
     do {
@@ -390,12 +334,10 @@ longest_match(IPos cur_match)
         /* This code assumes sizeof(unsigned short) == 2. Do not use
          * UNALIGNED_OK if your compiler uses a different size.
          */
-        if (*(ush*)(match+best_len-1) != scan_end ||
-            *(ush*)match != scan_start) continue;
+        if (*(ush*)(match+best_len-1) != scan_end
+        ||          match[2]          != scan[2] )     continue;
 
-        /* It is not necessary to compare scan[2] and match[2] since they are
-         * always equal when the other bytes match, given that the hash keys
-         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
+        /* Compare 2 bytes at a time at
          * strstart+3, +5, ... up to strstart+257. We check for insufficient
          * lookahead only every 4th comparison; the 128th check will be made
          * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
@@ -422,16 +364,12 @@ longest_match(IPos cur_match)
 
         if (match[best_len]   != scan_end  ||
             match[best_len-1] != scan_end1 ||
-            *match            != *scan     ||
-            *++match          != scan[1])      continue;
-
+            match[2]          != scan[2] )      continue;
         /* The check at best_len-1 can be removed because it will be made
          * again later. (This heuristic is not always a win.)
-         * It is not necessary to compare scan[2] and match[2] since they
-         * are always equal when the other bytes match, given that
-         * the hash keys are equal and that HASH_BITS >= 8.
          */
-        scan += 2, match++;
+
+        scan += 2, match += 2;
 
         /* We check for insufficient lookahead only every 8th comparison;
          * the 256th check will be made at strstart+258.
@@ -459,7 +397,7 @@ longest_match(IPos cur_match)
             scan_end   = scan[best_len];
 #endif
         }
-    } while ((cur_match = prev[cur_match & WMASK]) > limit
+    } while ((cur_match = prev[cur_match]) > limit
              && --chain_length != 0);
 
     return best_len;
@@ -501,7 +439,8 @@ local void check_match(start, match, length)
  */
 local void fill_window()
 {
-    register unsigned n, m;
+    unsigned rim = 1;  /* at start, only NIL is "under the rim" */
+    unsigned slide = 0;
     unsigned more = (unsigned)(window_size - (ulg)lookahead - (ulg)strstart);
     /* Amount of free space at the end of the window. */
 
@@ -519,36 +458,31 @@ local void fill_window()
          */
         Assert(window_size == (ulg)2*WSIZE, "no sliding with BIG_MEM");
 
-        memcpy((char*)window, (char*)window+WSIZE, (unsigned)WSIZE);
+        slide = WSIZE;
+        memcpy((char*)window, (char const*)window+WSIZE, WSIZE);
+        block_start -= (long) WSIZE;
         match_start -= WSIZE;
         strstart    -= WSIZE; /* we now have strstart >= MAX_DIST: */
-
-        block_start -= (long) WSIZE;
-
-        for (n = 0; n < HASH_SIZE; n++) {
-            m = head[n];
-            head[n] = (Pos)(m >= WSIZE ? m-WSIZE : NIL);
-        }
-        for (n = 0; n < WSIZE; n++) {
-            m = prev[n];
-            prev[n] = (Pos)(m >= WSIZE ? m-WSIZE : NIL);
-            /* If n is not on any hash chain, prev[n] is garbage but
-             * its value will never be used.
-             */
-        }
-        more += WSIZE;
+        rim = strstart+lookahead;
+        more        += WSIZE;
     }
-    /* At this point, more >= 2 */
-    if (!eofile) {
-        n = read_buf((char*)window+strstart+lookahead, more);
+    unsigned const edge = strstart+lookahead;
+    while (lookahead < MIN_LOOKAHEAD && !eofile) {
+        /* At this point, more >= 2 */
+        unsigned const n = read_buf((char*)window+strstart+lookahead, more);
         if (n == 0 || n == (unsigned)EOF) {
             eofile = 1;
             /* Don't let garbage pollute the dictionary.  */
             memzero (window + strstart + lookahead, MIN_MATCH - 1);
         } else {
             lookahead += n;
+            more = (unsigned)(window_size - (ulg)lookahead - (ulg)strstart);
+            if (more==(unsigned)EOF)
+                more--;
         }
     }
+    if (edge!=(strstart+lookahead))
+        do_chain(window, window+edge, window+strstart+lookahead, rim, slide);
 }
 
 /* ===========================================================================
@@ -616,8 +550,9 @@ local off_t deflate_fast()
             } else {
                 strstart += match_length;
                 match_length = 0;
-                ins_h = window[strstart];
-                UPDATE_HASH(ins_h, window[strstart+1]);
+                ins_h = 0;
+                UPDATE_HASH(ins_h, strstart  );
+                UPDATE_HASH(ins_h, strstart+1);
 #if MIN_MATCH != 3
                 Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
@@ -636,7 +571,7 @@ local off_t deflate_fast()
          * for the next match, plus MIN_MATCH bytes to insert the
          * string following the next match.
          */
-        while (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
+        if (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
 
     }
     return FLUSH_BLOCK(1); /* eof */
@@ -742,7 +677,7 @@ off_t deflate()
          * for the next match, plus MIN_MATCH bytes to insert the
          * string following the next match.
          */
-        while (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
+        if (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
     }
     if (match_available) ct_tally (0, window[strstart-1]);
 
diff --git a/deflate.h b/deflate.h
index 43b02ea..9f4f4df 100644
--- a/deflate.h
+++ b/deflate.h
@@ -38,4 +38,34 @@
    /* For portability to 16 bit machines, do not use values above 15. */
 #endif
 
+typedef ush Pos;
+typedef unsigned IPos;
+/* A Pos is an index in the character window. We use short instead of int to
+ * save space in the various tables. IPos is used only for parameter passing.
+ */
+
+#define NIL 0
+/* Tail of hash chains */
+
+#ifdef UNALIGNED_OK
+#  define GET2(p) *(ush const *)(p)  /* native endian */
+#else
+#  define GET2(p) (((p)[0]<<8) | (p)[1])  /* happens to be big endian */
+#endif
+
+extern int do_chain(
+    uch const *const bow,  /* lo end including look-back */
+    uch const *const bow2, /* lo end of new data */
+    uch const *const eow,  /* hi end of new data */
+    IPos rim,  /* threshold for new chains */
+    unsigned slide
+);
+
+#define CHAIN_2 0  /* 2-byte exact */
+#define CHAIN_3 1  /* 3-byte exact, chain_length*0.5  */
+#define CHAIN_4 2  /* 3-byte exact, chain_length*0.75 */
+extern unsigned chain_style;  /* style of chaining */
+
+#define LEN_prev    (2L*WSIZE)  /* both old and new window */
+#define LEN_head  (1uL<<(2*8))  /* index by pair of bytes */
 #endif
diff --git a/gzip.c b/gzip.c
index 7309e84..6fa4cee 100644
--- a/gzip.c
+++ b/gzip.c
@@ -152,11 +152,10 @@ DECLARE(ush, d_buf,  DIST_BUFSIZE);
 #define LEN_window (2L*WSIZE)
 #define LEN_tab_suffix (1L<<LZW_BITS)
 DECLARE(uch, window, MAX(LEN_window, LEN_tab_suffix));
-#define LEN_prev        WSIZE
-#define LEN_head       (1L<<HASH_BITS)
 #ifndef MAXSEG_64K
 #   define LEN_tab_prefix   (1L<< LZW_BITS   )
-    DECLARE(ush, prev, MAX((LEN_prev + LEN_head), LEN_tab_prefix));
+    DECLARE(ush, prev, MAX(LEN_prev, LEN_tab_prefix));
+    DECLARE(ush, head,     LEN_head);
 #else
 #   define LEN_tab_prefix_0 (1L<<(LZW_BITS-1))
 #   define LEN_tab_prefix_1 (1L<<(LZW_BITS-1))
@@ -558,7 +557,8 @@ int main (int argc, char **argv)
     ALLOC(ush, d_buf,  DIST_BUFSIZE);
     ALLOC(uch, window, MAX(LEN_window, LEN_tab_suffix));
 #ifndef MAXSEG_64K
-    ALLOC(ush, prev, MAX((LEN_prev + LEN_head), LEN_tab_prefix));
+    ALLOC(ush, prev, MAX(LEN_prev, LEN_tab_prefix));
+    ALLOC(ush, head, LEN_HEAD);
 #else
     ALLOC(ush, prev, MAX(LEN_prev, LEN_tab_prefix0));
     ALLOC(ush, head, MAX(LEN_head, LEN_tab_prefix1));
@@ -1888,6 +1888,7 @@ local void do_exit(exitcode)
     FREE(window);
 #ifndef MAXSEG_64K
     FREE(prev);
+    FREE(head);
 #else
     FREE(prev);
     FREE(head);
diff --git a/gzip.h b/gzip.h
index acbaf55..43455d3 100644
--- a/gzip.h
+++ b/gzip.h
@@ -140,7 +140,7 @@ EXTERN(uch, window);         /* Sliding window */
 #ifndef MAXSEG_64K
    EXTERN(ush, prev);        /* hash link (see deflate.c) */
 #  define tab_prefix prev    /* prefix code (see unlzw.c) */
-#  define head (prev+WSIZE)  /* hash head (see deflate.c) */
+   EXTERN(ush, head);
 #else
    EXTERN(ush, prev);        /* hash link (see deflate.c) */
 #  define tab_prefix0 prev   /* prefix for even codes */

patch: gzip compress 26% to 43% faster at 6<=level

Reply via email to