On 28 March 2011 05:09, Michael Hope <[email protected]> wrote:
> Hi there. I'm looking for areas where the toolchain could generate
> faster code, and a good way of doing that is seeing how compiled code
> does against the best hand-written code. I know of skia, ffmpeg,
> pixman, Orc, and efl - what others are out there?
>
Hi Michael,
Great motivation for optimizing the existing libraries with NEON!
As far as I know, Android depends on several libraries, and some of
them are compute-bound:
- libpixelflinger -- a bit like pixman
There is no official documentation for PixelFlinger, but you can always
check out its source:
http://android.git.kernel.org/?p=platform/system/core.git;a=summary
I previously submitted a NEON optimization patch for libpixelflinger to AOSP:
https://review.source.android.com//#change,16358
- zlib
Using SIMD, we can optimize the 'copy / repeat an existing sequence'
step in LZ-style encoding.
A reference Intel SSE2 optimization patch is attached to this mail; a
rough NEON sketch of the same chunky-copy idea follows below.
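This is only an illustrative, untested sketch (the function name is mine,
not part of the attached patch). Like the SSE2 TRY_CHUNKY_COPY in the
patch, it assumes the match distance is at least 16 bytes, so the source
and destination chunks never overlap:

#include <arm_neon.h>
#include <stdint.h>

/* Copy 16-byte chunks from 'from' to 'out' until 'last' is reached,
 * possibly overshooting 'last' by up to 15 bytes -- the caller must
 * reserve that much slack in the output buffer, as the SSE2 variant
 * in the attached patch does (avail_out >= 273).
 */
static void neon_chunky_copy(uint8_t *out, const uint8_t *from,
                             const uint8_t *last)
{
    do {
        vst1q_u8(out, vld1q_u8(from));  /* unaligned 16-byte load/store */
        out  += 16;
        from += 16;
    } while (out < last);
}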
Sincerely,
-jserv
diff -urNp zlib-1.2.5-orig/deflate.c zlib-1.2.5/deflate.c
--- zlib-1.2.5-orig/deflate.c 2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/deflate.c 2010-07-26 03:53:34.000000000 +0800
@@ -49,6 +49,17 @@
/* @(#) $Id$ */
+/* We can use 2-byte chunks only if 'unsigned short' has been defined
+ * appropriately and MAX_MATCH has the default value.
+ */
+#ifdef UNALIGNED_OK
+# include <limits.h>
+# include "zutil.h"
+# if (MAX_MATCH != 258) || (USHRT_MAX != 0xffff)
+# undef UNALIGNED_OK
+# endif
+#endif
+
#include "deflate.h"
const char deflate_copyright[] =
@@ -1119,7 +1130,8 @@ local uInt longest_match(s, cur_match)
* However the length of the match is limited to the lookahead, so
* the output of deflate is not affected by the uninitialized values.
*/
-#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
+#ifdef UNALIGNED_OK
+
/* This code assumes sizeof(unsigned short) == 2. Do not use
* UNALIGNED_OK if your compiler uses a different size.
*/
diff -urNp zlib-1.2.5-orig/deflate.h zlib-1.2.5/deflate.h
--- zlib-1.2.5-orig/deflate.h 2010-04-19 12:00:46.000000000 +0800
+++ zlib-1.2.5/deflate.h 2010-07-26 03:53:34.000000000 +0800
@@ -251,9 +251,12 @@ typedef struct internal_state {
ulg bits_sent; /* bit length of compressed data sent mod 2^32 */
#endif
- ush bi_buf;
+ ulg bi_buf;
/* Output buffer. bits are inserted starting at the bottom (least
- * significant bits).
+ * significant bits). Room for at least two short values to allow
+ * for simpler overflow handling. However, if more than 16 bits
+ * have been buffered, it will be flushed and no more than 16
+ * bits will be in use afterwards.
*/
int bi_valid;
/* Number of valid bits in bi_buf. All bits above the last valid bit
@@ -274,6 +277,20 @@ typedef struct internal_state {
*/
#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
+/* Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+# define put_short(s, w) { \
+ *(ush*)(s->pending_buf + s->pending) = (ush)(w);\
+ s->pending += 2; \
+}
+#else
+# define put_short(s, w) { \
+ put_byte(s, (uch)((w) & 0xff)); \
+ put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+#endif
#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
/* Minimum amount of lookahead, except at the end of the input file.
diff -urNp zlib-1.2.5-orig/inffast.c zlib-1.2.5/inffast.c
--- zlib-1.2.5-orig/inffast.c 2010-04-19 12:16:23.000000000 +0800
+++ zlib-1.2.5/inffast.c 2010-07-26 03:53:34.000000000 +0800
@@ -1,5 +1,6 @@
/* inffast.c -- fast decoding
- * Copyright (C) 1995-2008, 2010 Mark Adler
+ * Copyright (C) 1995-2004, 2010 Mark Adler
+ * 2010 Optimizations by Stefan Fuhrmann
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@@ -10,16 +11,35 @@
#ifndef ASMINF
+/* This is a highly optimized implementation of the decoder function for
+ * large code blocks. It cannot be used to decode close to the end of
+ * the input or output buffers (see below).
+ *
+ * Before trying to hand-tune assembly code for your target, you should
+ * make sure that alignment, endianness, word size optimizations, etc. have
+ * already been enabled for the respective target platform.
+ *
+ * For MS VC++ 2008, the performance gain of the specialized code over
+ * DISABLE_INFLATE_FAST_OPTIMIZATIONS (baseline) is as follows:
+ *
+ * x86 (32 bit): +60% throughput
+ * x64 (64 bit): +70% throughput
+ *
+ * Measurements were taken on a Core i7 CPU with a mix of small and large
+ * buffers (110MB total) of varying content and an average compression ratio
+ * of 2.2.
+ */
+
/* Allow machine dependent optimization for post-increment or pre-increment.
- Based on testing to date,
- Pre-increment preferred for:
- - PowerPC G3 (Adler)
- - MIPS R5000 (Randers-Pehrson)
- Post-increment preferred for:
- - none
- No measurable difference:
- - Pentium III (Anderson)
- - M68060 (Nikl)
+ * Based on testing to date,
+ * Pre-increment preferred for:
+ * - PowerPC G3 (Adler)
+ * - MIPS R5000 (Randers-Pehrson)
+ * Post-increment preferred for:
+ * - none
+ * No measurable difference:
+ * - Pentium III (Anderson)
+ * - M68060 (Nikl)
*/
#ifdef POSTINC
# define OFF 0
@@ -29,6 +49,212 @@
# define PUP(a) *++(a)
#endif
+/* On a number of architectures, it is more efficient to
+ * read 64 bits from the input stream at once than only
+ * a 32 bit chunk. That allows for fewer memory accesses
+ * and calculations as well as for more aggressive loop
+ * unrolling.
+ */
+#if defined(_M_X64) || defined(__x86_64)
+# define HOLD_64BIT_CHUNKS
+#endif
+
+/* For debugging purposes, we may want to disable code
+ * optimizations, as we otherwise won't be able to exercise
+ * alternative code paths.
+ * Please note that undefining these features affects
+ * this file only.
+ */
+#ifdef DISABLE_INFLATE_FAST_OPTIMIZATIONS
+# ifdef UNALIGNED_OK
+# undef UNALIGNED_OK
+# endif
+# ifdef HOLD_64BIT_CHUNKS
+# undef HOLD_64BIT_CHUNKS
+# endif
+# ifdef LITTLE_ENDIAN
+# undef LITTLE_ENDIAN
+# endif
+# ifdef USE_SSE2
+# undef USE_SSE2
+# endif
+#endif
+
+/* A reusable code snippet. It copies 'len' bytes from 'from'
+ * to 'out'. 'len' must be 3 or larger. This code will be used
+ * when no optimization is available.
+ */
+#define STANDARD_MIN3_COPY\
+ while (len > 2) {\
+ PUP(out) = PUP(from);\
+ PUP(out) = PUP(from);\
+ PUP(out) = PUP(from);\
+ len -= 3;\
+ }\
+ if (len) { \
+ PUP(out) = PUP(from);\
+ if (len > 1)\
+ PUP(out) = PUP(from);\
+ }
+
+/* A reusable code snippet. It copies data from 'from' to 'out'
+ * up to 'last', with the last chunk possibly exceeding 'last'
+ * by up to 15 bytes.
+ */
+#ifdef USE_SSE2
+# include <emmintrin.h>
+# define TRY_CHUNKY_COPY\
+ if ((dist >= sizeof (__m128i)) || (last <= out)) { \
+ do {\
+ _mm_storeu_si128 ((__m128i*)(out+OFF), \
+ _mm_loadu_si128((const __m128i*)(from+OFF)));\
+ out += sizeof (__m128i);\
+ from += sizeof (__m128i);\
+ } while (out < last); \
+ }
+#else
+# define TRY_CHUNKY_COPY\
+ if (dist >= sizeof(long) || (last <= out)) { \
+ do {\
+ *(long*)(out+OFF) = *(long*)(from+OFF);\
+ out += sizeof (long);\
+ from += sizeof (long);\
+ } while (out < last); \
+ }
+#endif
+
+/* The 'copy / repeat an existing sequence' is at the core of LZ-
+ * style encoding. Therefore, whenever the CPU allows, we use a few
+ * unaligned multi-byte copies instead of many single-byte accesses.
+ *
+ * The local variable definition actually leads to better code
+ * being generated by the MS compiler.
+ */
+#ifdef UNALIGNED_OK
+# define QUICK_COPY\
+ {\
+ unsigned char FAR *from = out - dist;\
+ unsigned char FAR *last = out + len;\
+ TRY_CHUNKY_COPY\
+ else {\
+ do { \
+ *(out+OFF+0) = *(from+OFF+0);\
+ *(out+OFF+1) = *(from+OFF+1);\
+ *(out+OFF+2) = *(from+OFF+2);\
+ from += 3;\
+ out += 3;\
+ } while (out < last);\
+ }\
+ out = last;\
+ }
+#else
+# define QUICK_COPY\
+ from = out - dist;\
+ STANDARD_MIN3_COPY
+#endif
+
+/* Whenever we don't copy / repeat existing sequences, we add new
+ * literals. This is the code snippet that will be used in an
+ * unrolled loop for extracting literals one-by-one.
+ * We bail out if a non-literal has been found. We also assume that
+ * the loop head already made sure we don't read / write beyond
+ * buffer boundaries.
+ */
+#define EXTRACT_NEXT_IF_LITERAL\
+ here = lcode[hold & lmask];\
+ if (here.op != 0)\
+ goto dolen;\
+\
+ op = (unsigned)(here.bits);\
+ hold >>= op;\
+ bits -= op;\
+ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?\
+ "inflate: literal '%c'\n" :\
+ "inflate: literal 0x%02x\n", here.val));\
+ PUP(out) = (unsigned char)(here.val);
+
+/* Unrolled loop content. Using 32 bit chunks, we can unroll it
+ * only once because every step consumes up to 9 bits of the
+ * input stream. We have 25/57 bits (using 32/64 bit chunks)
+ * entering the loop but must leave with at least 9 bits left
+ * for the top of the main loop.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+# define LITERAL_UNROLL_SIZE 5
+# define UNROLLED_LITERAL_LOOP {\
+ EXTRACT_NEXT_IF_LITERAL \
+ EXTRACT_NEXT_IF_LITERAL \
+ EXTRACT_NEXT_IF_LITERAL \
+ EXTRACT_NEXT_IF_LITERAL \
+ EXTRACT_NEXT_IF_LITERAL \
+ }
+#else
+# define LITERAL_UNROLL_SIZE 1
+# define UNROLLED_LITERAL_LOOP { EXTRACT_NEXT_IF_LITERAL }
+#endif
+
+/* Chunk that can be prefetched from the input stream.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+# define HOLD_TYPE unsigned long long
+#else
+# define HOLD_TYPE unsigned long
+#endif
+
+/* Code snippet that reads a single byte from 'in' and
+ * adds it to the prefetched ('hold') data.
+ */
+#define PREFETCH_BYTE \
+ hold += (HOLD_TYPE)(PUP(in)) << bits;\
+ bits += 8;
+
+/* Code snippet that completely fills the prefetch variable.
+ */
+#if defined(LITTLE_ENDIAN) && defined(UNALIGNED_OK)
+# define TOP_UP_BITS \
+ {\
+ hold |= (*(HOLD_TYPE*)(in + OFF)) << bits;\
+ added = (sizeof (HOLD_TYPE) * 8 - bits) / 8;\
+ in += added;\
+ bits += added * 8; \
+ }
+#else
+# if defined(HOLD_64BIT_CHUNKS)
+# define TOP_UP_BITS\
+ if (bits < 33) {\
+ PREFETCH_BYTE\
+ PREFETCH_BYTE\
+ PREFETCH_BYTE\
+ PREFETCH_BYTE\
+ }\
+ if (bits < 49) {\
+ PREFETCH_BYTE\
+ PREFETCH_BYTE\
+ }\
+ if (bits < 57) {\
+ PREFETCH_BYTE\
+ }
+# else
+# define TOP_UP_BITS\
+ if (bits < 17) {\
+ PREFETCH_BYTE\
+ PREFETCH_BYTE\
+ }\
+ if (bits < 25) {\
+ PREFETCH_BYTE\
+ }
+# endif
+#endif
+
+/* For 64 bit chunks, we don't need to prefetch a second
+ * time inside the main loop when decoding the distance.
+ */
+#if defined(HOLD_64BIT_CHUNKS)
+# define TOP_UP_BITS_32
+#else
+# define TOP_UP_BITS_32 TOP_UP_BITS
+#endif
+
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
@@ -40,8 +266,8 @@
Entry assumptions:
state->mode == LEN
- strm->avail_in >= 6
- strm->avail_out >= 258
+ strm->avail_in >= 8
+ strm->avail_out >= 273
start >= strm->avail_out
state->bits < 8
@@ -56,13 +282,15 @@
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
- Therefore if strm->avail_in >= 6, then there is enough input to avoid
- checking for available input while decoding.
+ However, we prefetch 1x8 or 2x4 bytes. Therefore if strm->avail_in >= 8
+ is always true, then there is enough input to avoid checking for available
+ input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
- bytes, which is the maximum length that can be coded. inflate_fast()
- requires strm->avail_out >= 258 for each loop to avoid checking for
- output space.
+ bytes, which is the maximum length that can be coded. Another 15 bytes
+ padding are required to simplify copying in chunks of up to 16 bytes.
+ inflate_fast() requires strm->avail_out >= 273 for each loop to avoid
+ checking for output space.
*/
void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
@@ -81,8 +309,9 @@ unsigned start; /* inflate()'s s
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
+ HOLD_TYPE hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
+ unsigned added; /* number of bytes fetched in TOP_UP_BITS */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
@@ -97,10 +326,10 @@ unsigned start; /* inflate()'s s
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
in = strm->next_in - OFF;
- last = in + (strm->avail_in - 5);
+ last = in + (strm->avail_in - 7);
out = strm->next_out - OFF;
beg = out - (start - strm->avail_out);
- end = out + (strm->avail_out - 257);
+ end = out + (strm->avail_out - 272);
#ifdef INFLATE_STRICT
dmax = state->dmax;
#endif
@@ -117,61 +346,47 @@ unsigned start; /* inflate()'s s
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
+ TOP_UP_BITS /* bits = 32/64 */
do {
- if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
+ /* bits >= 10/10 */
here = lcode[hold & lmask];
dolen:
op = (unsigned)(here.bits);
hold >>= op;
bits -= op;
+ TOP_UP_BITS /* bits >= 25/57 */
+
op = (unsigned)(here.op);
if (op == 0) { /* literal */
Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
"inflate: literal '%c'\n" :
"inflate: literal 0x%02x\n", here.val));
PUP(out) = (unsigned char)(here.val);
+ /* bits >= 25/57 */
+ if (out + LITERAL_UNROLL_SIZE-1 < end && in < last)
+ UNROLLED_LITERAL_LOOP
+ /* bits >= 16/12 */
}
else if (op & 16) { /* length base */
len = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (op) {
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
bits -= op;
}
Tracevv((stderr, "inflate: length %u\n", len));
- if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
+ /* bits >= 10/42 */
here = dcode[hold & dmask];
dodist:
op = (unsigned)(here.bits);
hold >>= op;
bits -= op;
+ TOP_UP_BITS_32 /* bits >= 25/36 */
op = (unsigned)(here.op);
if (op & 16) { /* distance base */
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
- }
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
if (dist > dmax) {
@@ -182,6 +397,7 @@ unsigned start; /* inflate()'s s
#endif
hold >>= op;
bits -= op;
+ /* bits >= 10/21 */
Tracevv((stderr, "inflate: distance %u\n", dist));
op = (unsigned)(out - beg); /* max distance in output */
if (dist > op) { /* see if copy from window */
@@ -190,9 +406,9 @@ unsigned start; /* inflate()'s s
if (state->sane) {
strm->msg =
(char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
+ state->mode = BAD;
+ break;
+ }
#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
if (len <= op - whave) {
do {
@@ -253,31 +469,10 @@ unsigned start; /* inflate()'s s
from = out - dist; /* rest from output */
}
}
- while (len > 2) {
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- len -= 3;
- }
- if (len) {
- PUP(out) = PUP(from);
- if (len > 1)
- PUP(out) = PUP(from);
- }
+ STANDARD_MIN3_COPY
}
- else {
- from = out - dist; /* copy direct from output */
- do { /* minimum length is three */
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- len -= 3;
- } while (len > 2);
- if (len) {
- PUP(out) = PUP(from);
- if (len > 1)
- PUP(out) = PUP(from);
- }
+ else { /* copy direct from output */
+ QUICK_COPY
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
@@ -304,7 +499,7 @@ unsigned start; /* inflate()'s s
state->mode = BAD;
break;
}
- } while (in < last && out < end);
+ } while (out < end && in < last);
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
len = bits >> 3;
@@ -315,9 +510,9 @@ unsigned start; /* inflate()'s s
/* update state and return */
strm->next_in = in + OFF;
strm->next_out = out + OFF;
- strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+ strm->avail_in = (unsigned)(in < last ? 7 + (last - in) : 7 - (in - last));
strm->avail_out = (unsigned)(out < end ?
- 257 + (end - out) : 257 - (out - end));
+ 272 + (end - out) : 272 - (out - end));
state->hold = hold;
state->bits = bits;
return;
@@ -335,6 +530,26 @@ unsigned start; /* inflate()'s s
- Swapping window/direct else
- Larger unrolled copy loops (three is about right)
- Moving len -= 3 statement into middle of loop
+
+ The critical code path is the following:
+ here = lcode[hold & lmask];
+ op = (unsigned)(here.bits);
+
+ It requires
+ 2 accesses to hold and lmask (0 ticks if in register,
+ otherwise: 4 ticks typ = 1 + L1 latency)
+ +1 ALU latency (usually 1 tick)
+ +1 L1 latency (2..4 ticks, typ. 3 ticks)
+ +1 member access latency (0 ticks on some arch if 'bits' is the MSB,
+ 2 ALU ops / 2 ticks otherwise)
+ -> 4 .. 12 ticks latency
+
+ Therefore, we "splice" the data prefetch code (hold) into the critical
+ path (a good compiler will interleave the data load from TOP_UP_BITS
+ with the lcode access). All calculations can be parallelized very well on
+ most architectures, so that TOP_UP_BITS becomes relatively cheap at 4 or
+ fewer ticks of overhead with no branch mispredictions possible. Also, 'hold'
+ will be readily available in the next iteration.
*/
#endif /* !ASMINF */
diff -urNp zlib-1.2.5-orig/inftrees.c zlib-1.2.5/inftrees.c
--- zlib-1.2.5-orig/inftrees.c 2010-04-20 12:12:21.000000000 +0800
+++ zlib-1.2.5/inftrees.c 2010-07-26 03:53:34.000000000 +0800
@@ -138,13 +138,20 @@ unsigned short FAR *work;
return -1; /* incomplete set */
/* generate offsets into symbol table for each length for sorting */
- offs[1] = 0;
- for (len = 1; len < MAXBITS; len++)
- offs[len + 1] = offs[len] + count[len];
+ {
+ unsigned short offset = 0;
+ offs[1] = 0;
+ for (len = 1; len < MAXBITS; len++) {
+ offset += count[len];
+ offs[len + 1] = offset;
+ }
+ }
/* sort symbols by length, by symbol order within each length */
- for (sym = 0; sym < codes; sym++)
- if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
+ for (sym = 0; sym < codes; sym++) {
+ unsigned len = lens[sym];
+ if (len != 0) work[offs[len]++] = (unsigned short)sym;
+ }
/*
Create and fill in decoding tables. In this loop, the table being
@@ -215,14 +222,15 @@ unsigned short FAR *work;
/* process all codes and make table entries */
for (;;) {
/* create table entry */
+ unsigned work_sym = work[sym];
here.bits = (unsigned char)(len - drop);
- if ((int)(work[sym]) < end) {
+ if ((int)(work_sym) < end) {
here.op = (unsigned char)0;
- here.val = work[sym];
+ here.val = work_sym;
}
- else if ((int)(work[sym]) > end) {
- here.op = (unsigned char)(extra[work[sym]]);
- here.val = base[work[sym]];
+ else if ((int)(work_sym) > end) {
+ here.op = (unsigned char)(extra[work_sym]);
+ here.val = base[work_sym];
}
else {
here.op = (unsigned char)(32 + 64); /* end of block */
diff -urNp zlib-1.2.5-orig/trees.c zlib-1.2.5/trees.c
--- zlib-1.2.5-orig/trees.c 2010-04-19 12:03:44.000000000 +0800
+++ zlib-1.2.5/trees.c 2010-07-26 03:53:34.000000000 +0800
@@ -175,15 +175,6 @@ local void gen_trees_header OF((void));
#endif
/* ===========================================================================
- * Output a short LSB first on the stream.
- * IN assertion: there is enough room in pendingBuf.
- */
-#define put_short(s, w) { \
- put_byte(s, (uch)((w) & 0xff)); \
- put_byte(s, (uch)((ush)(w) >> 8)); \
-}
-
-/* ===========================================================================
* Send a value on a given number of bits.
* IN assertion: length <= 16 and value fits in length bits.
*/
@@ -203,29 +194,23 @@ local void send_bits(s, value, length)
* (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
* unused bits in value.
*/
- if (s->bi_valid > (int)Buf_size - length) {
- s->bi_buf |= (ush)value << s->bi_valid;
+ s->bi_buf |= (ulg)value << s->bi_valid;
+ s->bi_valid += (ulg)length;
+ if (s->bi_valid > Buf_size) {
put_short(s, s->bi_buf);
- s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
- s->bi_valid += length - Buf_size;
- } else {
- s->bi_buf |= (ush)value << s->bi_valid;
- s->bi_valid += length;
- }
+ s->bi_buf >>= Buf_size;
+ s->bi_valid -= Buf_size;
+ }
}
#else /* !DEBUG */
#define send_bits(s, value, length) \
-{ int len = length;\
- if (s->bi_valid > (int)Buf_size - len) {\
- int val = value;\
- s->bi_buf |= (ush)val << s->bi_valid;\
+{ s->bi_buf |= (ulg)(value) << s->bi_valid;\
+ s->bi_valid += (ulg)(length);\
+ if (s->bi_valid > Buf_size) {\
put_short(s, s->bi_buf);\
- s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
- s->bi_valid += len - Buf_size;\
- } else {\
- s->bi_buf |= (ush)(value) << s->bi_valid;\
- s->bi_valid += len;\
+ s->bi_buf >>= Buf_size;\
+ s->bi_valid -= Buf_size;\
}\
}
#endif /* DEBUG */
@@ -1154,7 +1139,7 @@ local int detect_data_type(s)
|| s->dyn_ltree[13].Freq != 0)
return Z_TEXT;
for (n = 32; n < LITERALS; n++)
- if (s->dyn_ltree[n].Freq != 0)
+ if (s->dyn_ltree[n].Freq != 0)
return Z_TEXT;
/* There are no "black-listed" or "white-listed" bytes:
diff -urNp zlib-1.2.5-orig/zconf.h zlib-1.2.5/zconf.h
--- zlib-1.2.5-orig/zconf.h 2010-04-19 01:58:06.000000000 +0800
+++ zlib-1.2.5/zconf.h 2010-07-26 03:53:34.000000000 +0800
@@ -160,10 +160,52 @@
#ifdef SYS16BIT
# define MAXSEG_64K
#endif
-#ifdef MSDOS
+
+/*
+ * Many machines allow efficient access to unaligned data; that is,
+ * reading 2 or more bytes at once from a random and possibly unaligned
+ * memory address is *on average* more efficient than reading the data
+ * one byte at a time and then combining it.
+ */
+#if !defined(UNALIGNED_OK) && defined(MSDOS)
+# define UNALIGNED_OK
+#endif
+#if !defined(UNALIGNED_OK) && (defined(_M_IX86) || defined(_M_X64))
# define UNALIGNED_OK
#endif
+#if !defined(UNALIGNED_OK) && (defined(i386) || defined(__x86_64))
+# define UNALIGNED_OK
+#endif
+
+/*
+ * Most information in compressed data streams is stored in LSB first
+ * (little endian) order. If that matches the machine byte order, we may
+ * apply certain optimizations.
+ */
+#if !defined(LITTLE_ENDIAN) && (defined(_M_IX86) || defined(_M_X64))
+# define LITTLE_ENDIAN
+#endif
+#if !defined(LITTLE_ENDIAN) && (defined(i386) || defined(__x86_64))
+# define LITTLE_ENDIAN
+#endif
+#if !defined(LITTLE_ENDIAN) && defined(__LITTLE_ENDIAN__)
+# define LITTLE_ENDIAN
+#endif
+
+/*
+ * With the availability of SSE2, we can optimize certain functions
+ * by operating on large chunks of data at once.
+ */
+#if !defined(USE_SSE2) && defined(__GNUC__) && defined(__SSE2__)
+# define USE_SSE2
+#endif
+#if !defined(USE_SSE2) && (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP>=2)))
+# define USE_SSE2
+#endif
+/*
+ * C standard level.
+ */
#ifdef __STDC_VERSION__
# ifndef STDC
# define STDC
_______________________________________________
linaro-dev mailing list
[email protected]
http://lists.linaro.org/mailman/listinfo/linaro-dev