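From: Yikun Jiang <yikunk...@gmail.com> This patch makes two performance changes to deflate.c. First, on AArch64 (guarded by __aarch64__), longest_match() now reads next_match at the top of each loop iteration and issues a PRFM prefetch for the hash-chain entry prev[next_match & WMASK], so that the entry is more likely to be in cache by the time the chain is followed. Second, the string-insertion loop in deflate() is unrolled four times to reduce the number of loop-condition branches executed. --- deflate.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-)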
diff --git a/deflate.c b/deflate.c index 5ed2a9b..008c032 100644 --- a/deflate.c +++ b/deflate.c @@ -378,6 +378,9 @@ longest_match(IPos cur_match) register int len; /* length of current match */ int best_len = prev_length; /* best match length so far */ IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL; +#ifdef __aarch64__ + IPos next_match; +#endif /* Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0. */ @@ -411,6 +414,10 @@ longest_match(IPos cur_match) do { Assert(cur_match < strstart, "no future"); match = window + cur_match; +#ifdef __aarch64__ + next_match = prev[cur_match & WMASK]; + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK]))); +#endif /* Skip to next match if the match length cannot increase * or if the match length is less than 2: @@ -488,8 +495,14 @@ longest_match(IPos cur_match) scan_end = scan[best_len]; #endif } - } while ((cur_match = prev[cur_match & WMASK]) > limit - && --chain_length != 0); + } +#ifdef __aarch64__ + while ((cur_match = next_match) > limit + && --chain_length != 0); +#else + while ((cur_match = prev[cur_match & WMASK]) > limit + && --chain_length != 0); +#endif return best_len; } @@ -777,7 +790,20 @@ deflate (int pack_level) lookahead -= prev_length-1; prev_length -= 2; RSYNC_ROLL(strstart, prev_length+1); + + while (prev_length >= 4) { + prev_length -= 4; + strstart++; + INSERT_STRING(strstart, hash_head); + strstart++; + INSERT_STRING(strstart, hash_head); + strstart++; + INSERT_STRING(strstart, hash_head); + strstart++; + INSERT_STRING(strstart, hash_head); + } do { + if (prev_length == 0) break; strstart++; INSERT_STRING(strstart, hash_head); /* strstart never exceeds WSIZE-MAX_MATCH, so there are -- 2.17.1