@cfbot: rebased
From 03fec5d2587cf34a1d1a75d7afdcfbad9cb7ec68 Mon Sep 17 00:00:00 2001
From: Andrey <amboro...@acm.org>
Date: Thu, 27 Jun 2019 23:18:21 +0500
Subject: [PATCH] Reorganize pglz compression code
This patch accumulates several changes:
1. Convert macro-functions to regular functions for readability
2. Use a more compact hash table with uint16 indexes instead of pointers
3. Avoid the prev pointer in the hash table
4. Use 4-byte comparisons in the match search instead of 1-byte
Total speedup is about x1.4
---
 src/common/pg_lzcompress.c | 475 ++++++++++++++++++++++---------------
 1 file changed, 288 insertions(+), 187 deletions(-)

diff --git a/src/common/pg_lzcompress.c b/src/common/pg_lzcompress.c
index fdd527f757..ffa4fe549e 100644
--- a/src/common/pg_lzcompress.c
+++ b/src/common/pg_lzcompress.c
@@ -118,13 +118,13 @@
  * our 4K maximum look-back distance is too small.
  *
  * The compressor creates a table for lists of positions.
- * For each input position (except the last 3), a hash key is
+ * For each input position (except the last 4), a hash key is
  * built from the 4 next input bytes and the position remembered
  * in the appropriate list. Thus, the table points to linked
  * lists of likely to be at least in the first 4 characters
  * matching strings. This is done on the fly while the input
  * is compressed into the output area. Table entries are only
- * kept for the last 4096 input positions, since we cannot use
+ * kept for the last 4094 input positions, since we cannot use
  * back-pointers larger than that anyway. The size of the hash
  * table is chosen based on the size of the input - a larger table
  * has a larger startup cost, as it needs to be initialized to
@@ -146,17 +146,15 @@
  * used for the next tag in the output.
  *
  * For each subsequent entry in the history list, the "good_match"
- * is lowered by 10%. So the compressor will be more happy with
- * short matches the farer it has to go back in the history.
+ * is lowered by roughly 10%. So the compressor will be happier
+ * with short matches the further it has to go back in the history.
  * Another "speed against ratio" preference characteristic of
  * the algorithm.
  *
- * Thus there are 3 stop conditions for the lookup of matches:
+ * Thus there are 2 stop conditions for the lookup of matches:
  *
  * - a match >= good_match is found
  * - there are no more history entries to look at
- * - the next history entry is already too far back
- *   to be coded into a tag.
  *
  * Finally the match algorithm checks that at least a match
  * of 3 or more bytes has been found, because that is the smallest
@@ -173,6 +171,10 @@
  * Jan Wieck
  *
  * Copyright (c) 1999-2021, PostgreSQL Global Development Group
+ * Many thanks to Yann Collet for his article about unaligned
+ * memory access.
+ *
+ * Leskov Vladimir
  *
  * src/common/pg_lzcompress.c
  * ----------
@@ -188,12 +190,57 @@

 #include "common/pg_lzcompress.h"

+/**************************************
+ *  CPU Feature Detection             *
+ **************************************/
+/* PGLZ_FORCE_MEMORY_ACCESS
+ * By default, unaligned memory is accessed through memcpy(), which is safe
+ * and portable. Unfortunately, on some target/compiler combinations, the
+ * generated assembly is sub-optimal. The switch below allows selecting a
+ * different access method for improved performance.
+ * Method 0 (default): use memcpy(). Safe and portable.
+ * Method 1: direct access. This method is portable but violates the C
+ * standard and can generate buggy code on targets whose assembly
+ * generation depends on alignment. In some circumstances, however, it is
+ * the only known way to get the best performance (e.g. GCC + ARMv6).
+ * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1).
+ */
+#ifndef PGLZ_FORCE_MEMORY_ACCESS	/* can be defined externally */
+#if defined(__GNUC__) && \
+	( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \
+	|| defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#define PGLZ_FORCE_MEMORY_ACCESS 1
+#endif
+#endif
+
+#if defined(PGLZ_FORCE_MEMORY_ACCESS) && (PGLZ_FORCE_MEMORY_ACCESS==1)
+/* lie to the compiler about data alignment; use with caution */
+
+static uint32
+pglz_read32(const void *ptr)
+{
+	return *(const uint32 *) ptr;
+}
+
+#else							/* safe and portable access using memcpy() */
+
+static uint32
+pglz_read32(const void *ptr)
+{
+	uint32		val;
+
+	memcpy(&val, ptr, sizeof(val));
+	return val;
+}
+
+#endif							/* PGLZ_FORCE_MEMORY_ACCESS */
+
+
 /* ----------
  * Local definitions
  * ----------
  */
 #define PGLZ_MAX_HISTORY_LISTS	8192	/* must be power of 2 */
-#define PGLZ_HISTORY_SIZE		4096
+#define PGLZ_HISTORY_SIZE		(0x0fff - 1)	/* to avoid a compare in iteration */
 #define PGLZ_MAX_MATCH			273

@@ -202,17 +249,16 @@
  *
  *		Linked list for the backward history lookup
  *
- * All the entries sharing a hash key are linked in a doubly linked list.
- * This makes it easy to remove an entry when it's time to recycle it
- * (because it's more than 4K positions old).
+ * All the entries sharing a hash key are linked in a singly linked list.
+ * Links are not changed during insertion, in order to speed it up.
+ * Instead, a more complicated stop condition is used during list iteration.
  * ----------
  */
 typedef struct PGLZ_HistEntry
 {
-	struct PGLZ_HistEntry *next;	/* links for my hash key's list */
-	struct PGLZ_HistEntry *prev;
-	int			hindex;			/* my current hash key */
-	const char *pos;			/* my input position */
+	int16		next_id;		/* link to the next entry in my hash key's list */
+	uint16		hist_idx;		/* my current hash key */
+	const unsigned char *pos;	/* my input position */
 } PGLZ_HistEntry;

@@ -256,11 +302,9 @@ static int16 hist_start[PGLZ_MAX_HISTORY_LISTS];
 static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1];

 /*
- * Element 0 in hist_entries is unused, and means 'invalid'. Likewise,
- * INVALID_ENTRY_PTR in next/prev pointers mean 'invalid'.
+ * Element 0 in hist_entries is unused, and means 'invalid'.
  */
 #define INVALID_ENTRY			0
-#define INVALID_ENTRY_PTR		(&hist_entries[INVALID_ENTRY])

 /* ----------
  * pglz_hist_idx -
@@ -274,117 +318,111 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1];
  * hash keys more.
  * ----------
  */
-#define pglz_hist_idx(_s,_e, _mask) ( \
-			((((_e) - (_s)) < 4) ? (int) (_s)[0] : \
-			 (((_s)[0] << 6) ^ ((_s)[1] << 4) ^ \
-			  ((_s)[2] << 2) ^ (_s)[3])) & (_mask) \
-		)
+static inline uint16
+pglz_hist_idx(const unsigned char *s, uint16 mask)
+{
+	/*
+	 * Note that we only calculate the hash this way at the start of the
+	 * input. Subsequent hash values are computed incrementally by removing
+	 * the previous byte and adding the next one. For details see
+	 * pglz_hist_add().
+	 */
+	return ((s[0] << 6) ^ (s[1] << 4) ^ (s[2] << 2) ^ s[3]) & mask;
+}

 /* ----------
  * pglz_hist_add -
  *
- *		Adds a new entry to the history table.
- *
- * If _recycle is true, then we are recycling a previously used entry,
- * and must first delink it from its old hashcode's linked list.
- *
- * NOTE: beware of multiple evaluations of macro's arguments, and note that
- * _hn and _recycle are modified in the macro.
+ *		Adds a new entry to the history table
+ *		and recalculates the hash value for the next position.
  * ----------
  */
-#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e, _mask) \
-do { \
-			int __hindex = pglz_hist_idx((_s),(_e), (_mask)); \
-			int16 *__myhsp = &(_hs)[__hindex]; \
-			PGLZ_HistEntry *__myhe = &(_he)[_hn]; \
-			if (_recycle) { \
-				if (__myhe->prev == NULL) \
-					(_hs)[__myhe->hindex] = __myhe->next - (_he); \
-				else \
-					__myhe->prev->next = __myhe->next; \
-				if (__myhe->next != NULL) \
-					__myhe->next->prev = __myhe->prev; \
-			} \
-			__myhe->next = &(_he)[*__myhsp]; \
-			__myhe->prev = NULL; \
-			__myhe->hindex = __hindex; \
-			__myhe->pos = (_s); \
-			/* If there was an existing entry in this hash slot, link */ \
-			/* this new entry to it. However, the 0th entry in the */ \
-			/* entries table is unused, so we can freely scribble on it. */ \
-			/* So don't bother checking if the slot was used - we'll */ \
-			/* scribble on the unused entry if it was not, but that's */ \
-			/* harmless. Avoiding the branch in this critical path */ \
-			/* speeds this up a little bit. */ \
-			/* if (*__myhsp != INVALID_ENTRY) */ \
-			(_he)[(*__myhsp)].prev = __myhe; \
-			*__myhsp = _hn; \
-			if (++(_hn) >= PGLZ_HISTORY_SIZE + 1) { \
-				(_hn) = 1; \
-				(_recycle) = true; \
-			} \
-} while (0)
+static inline int16
+pglz_hist_add(int16 hist_next, uint16 *hist_idx, const unsigned char *s, uint16 mask)
+{
+	int16	   *my_hist_start = &hist_start[*hist_idx];
+	PGLZ_HistEntry *entry = &(hist_entries)[hist_next];
+
+	/*
+	 * Initialize the entry with the new values.
+	 */
+	entry->next_id = *my_hist_start;
+	entry->hist_idx = *hist_idx;
+	entry->pos = s;

-/* ----------
- * pglz_out_ctrl -
- *
- *		Outputs the last and allocates a new control byte if needed.
- * ----------
- */
-#define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \
-do { \
-	if ((__ctrl & 0xff) == 0) \
-	{ \
-		*(__ctrlp) = __ctrlb; \
-		__ctrlp = (__buf)++; \
-		__ctrlb = 0; \
-		__ctrl = 1; \
-	} \
-} while (0)
+	/*
+	 * Update the linked list head pointer.
+	 */
+	*my_hist_start = hist_next;
+
+	/*
+	 * Recalculate the hash value for the next position: remove the current
+	 * byte and add the next one.
+	 */
+	*hist_idx = ((((*hist_idx) ^ (s[0] << 6)) << 2) ^ s[4]) & mask;
+
+	/*
+	 * Advance the history pointer, wrapping around to 1 (entry 0 is
+	 * reserved to mean 'invalid').
+	 */
+	hist_next++;
+	if (hist_next == PGLZ_HISTORY_SIZE + 1)
+	{
+		hist_next = 1;
+	}
+	return hist_next;
+}

 /* ----------
- * pglz_out_literal -
+ * pglz_out_tag -
  *
- *		Outputs a literal byte to the destination buffer including the
+ *		Outputs a backward reference tag of 2-4 bytes (depending on
+ *		offset and length) to the destination buffer including the
  *		appropriate control bit.
  * ----------
  */
-#define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \
-do { \
-	pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
-	*(_buf)++ = (unsigned char)(_byte); \
-	_ctrl <<= 1; \
-} while (0)
+static inline unsigned char *
+pglz_out_tag(unsigned char *dest_ptr, int32 match_len, int32 match_offset)
+{
+	if (match_len > 17)
+	{
+		*(dest_ptr++) = (unsigned char) (((match_offset & 0xf00) >> 4) | 0x0f);
+		*(dest_ptr++) = (unsigned char) (match_offset & 0xff);
+		*(dest_ptr++) = (unsigned char) (match_len - 18);
+	}
+	else
+	{
+		*(dest_ptr++) = (unsigned char) (((match_offset & 0xf00) >> 4) | (match_len - 3));
+		*(dest_ptr++) = (unsigned char) (match_offset & 0xff);
+	}
+	return dest_ptr;
+}

 /* ----------
- * pglz_out_tag -
+ * pglz_compare -
  *
- *		Outputs a backward reference tag of 2-4 bytes (depending on
- *		offset and length) to the destination buffer including the
- *		appropriate control bit.
+ *		Compares two sequences of bytes
+ *		and returns the position of the first mismatch.
  * ----------
  */
-#define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) \
-do { \
-	pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \
-	_ctrlb |= _ctrl; \
-	_ctrl <<= 1; \
-	if (_len > 17) \
-	{ \
-		(_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \
-		(_buf)[1] = (unsigned char)(((_off) & 0xff)); \
-		(_buf)[2] = (unsigned char)((_len) - 18); \
-		(_buf) += 3; \
-	} else { \
-		(_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3)); \
-		(_buf)[1] = (unsigned char)((_off) & 0xff); \
-		(_buf) += 2; \
-	} \
-} while (0)
+static inline int32
+pglz_compare(int32 len, int32 len_bound, const unsigned char *input_pos,
+			 const unsigned char *hist_pos)
+{
+	while (len <= len_bound - 4 && pglz_read32(input_pos) == pglz_read32(hist_pos))
+	{
+		len += 4;
+		input_pos += 4;
+		hist_pos += 4;
+	}
+	while (len < len_bound && *input_pos == *hist_pos)
+	{
+		len++;
+		input_pos++;
+		hist_pos++;
+	}
+	return len;
+}

 /* ----------
@@ -396,32 +434,40 @@ do { \
  * ----------
  */
 static inline int
-pglz_find_match(int16 *hstart, const char *input, const char *end,
-				int *lenp, int *offp, int good_match, int good_drop, int mask)
+pglz_find_match(uint16 hist_idx, const unsigned char *input, const unsigned char *end,
+				int *lenp, int *offp, int good_match, int good_drop)
 {
-	PGLZ_HistEntry *hent;
-	int16		hentno;
+	PGLZ_HistEntry *hist_entry;
+	int16	   *hist_entry_number;
 	int32		len = 0;
-	int32		off = 0;
+	int32		offset = 0;
+	int32		cur_len = 0;
+	int32		len_bound = Min(end - input, PGLZ_MAX_MATCH);

 	/*
 	 * Traverse the linked history list until a good enough match is found.
 	 */
-	hentno = hstart[pglz_hist_idx(input, end, mask)];
-	hent = &hist_entries[hentno];
-	while (hent != INVALID_ENTRY_PTR)
-	{
-		const char *ip = input;
-		const char *hp = hent->pos;
-		int32		thisoff;
-		int32		thislen;
+	hist_entry_number = &hist_start[hist_idx];
+	if (*hist_entry_number == INVALID_ENTRY)
+		return 0;

+	hist_entry = &hist_entries[*hist_entry_number];
+	if (hist_idx != hist_entry->hist_idx)
+	{
 		/*
-		 * Stop if the offset does not fit into our tag anymore.
+		 * If the current linked list head points to an invalid entry, clear
+		 * it to reduce the number of comparisons in the future.
 		 */
-		thisoff = ip - hp;
-		if (thisoff >= 0x0fff)
-			break;
+		*hist_entry_number = INVALID_ENTRY;
+		return 0;
+	}
+
+	while (true)
+	{
+		const unsigned char *input_pos = input;
+		const unsigned char *hist_pos = hist_entry->pos;
+		const unsigned char *my_pos;
+		int32		cur_offset = input_pos - hist_pos;

 		/*
 		 * Determine length of match. A better match must be larger than the
@@ -431,58 +477,60 @@ pglz_find_match(int16 *hstart, const char *input, const char *end,
 		 * character by character comparison to know the exact position where
 		 * the diff occurred.
 		 */
-		thislen = 0;
 		if (len >= 16)
 		{
-			if (memcmp(ip, hp, len) == 0)
+			if (memcmp(input_pos, hist_pos, len) == 0)
 			{
-				thislen = len;
-				ip += len;
-				hp += len;
-				while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH)
-				{
-					thislen++;
-					ip++;
-					hp++;
-				}
+				offset = cur_offset;
+				len = pglz_compare(len, len_bound, input_pos + len, hist_pos + len);
 			}
 		}
 		else
 		{
-			while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH)
+			/*
+			 * Start the search for short matches by comparing 4 bytes.
+			 */
+			if (pglz_read32(input_pos) == pglz_read32(hist_pos))
 			{
-				thislen++;
-				ip++;
-				hp++;
+				cur_len = pglz_compare(4, len_bound, input_pos + 4, hist_pos + 4);
+				if (cur_len > len)
+				{
+					len = cur_len;
+					offset = cur_offset;
+				}
 			}
 		}

-		/*
-		 * Remember this match as the best (if it is)
-		 */
-		if (thislen > len)
-		{
-			len = thislen;
-			off = thisoff;
-		}
-
 		/*
 		 * Advance to the next history entry
 		 */
-		hent = hent->next;
+		my_pos = hist_entry->pos;
+		hist_entry = &hist_entries[hist_entry->next_id];

 		/*
-		 * Be happy with lesser good matches the more entries we visited. But
-		 * no point in doing calculation if we're at end of list.
+		 * Stop if the current match is already good enough. Since stale
+		 * list links are not updated during insertion, two further stop
+		 * conditions guard against following them: a recycled entry with a
+		 * different hash ends the walk (this also catches INVALID_ENTRY,
+		 * whose hist_idx never matches), and a recycled entry with the same
+		 * hash breaks the monotonicity of input positions, which the last
+		 * condition detects.
 		 */
-		if (hent != INVALID_ENTRY_PTR)
+		if (len >= good_match || hist_idx != hist_entry->hist_idx || my_pos <= hist_entry->pos)
 		{
-			if (len >= good_match)
-				break;
-			good_match -= (good_match * good_drop) / 100;
+			break;
 		}
+
+		/*
+		 * Be happy with less good matches the more entries we have visited.
+		 */
+		good_match -= (good_match * good_drop) >> 7;
 	}

+	/*
+	 * The found match can be slightly longer than necessary; bound it by
+	 * len_bound.
+	 */
+	len = Min(len, len_bound);
+
 	/*
 	 * Return match information only if it results at least in one byte
 	 * reduction.
@@ -490,7 +538,7 @@
 	 */
 	if (len > 2)
 	{
 		*lenp = len;
-		*offp = off;
+		*offp = offset;
 		return 1;
 	}

@@ -509,26 +557,28 @@
 int32
 pglz_compress(const char *source, int32 slen, char *dest,
 			  const PGLZ_Strategy *strategy)
 {
-	unsigned char *bp = (unsigned char *) dest;
-	unsigned char *bstart = bp;
-	int			hist_next = 1;
-	bool		hist_recycle = false;
-	const char *dp = source;
-	const char *dend = source + slen;
-	unsigned char ctrl_dummy = 0;
-	unsigned char *ctrlp = &ctrl_dummy;
-	unsigned char ctrlb = 0;
-	unsigned char ctrl = 0;
+	unsigned char *dest_ptr = (unsigned char *) dest;
+	unsigned char *dest_start = dest_ptr;
+	uint16		hist_next = 1;
+	uint16		hist_idx;
+	const unsigned char *src_ptr = (const unsigned char *) source;
+	const unsigned char *src_end = (const unsigned char *) source + slen;
+	const unsigned char *compress_src_end = src_end - 4;
+	unsigned char control_dummy = 0;
+	unsigned char *control_ptr = &control_dummy;
+	unsigned char control_byte = 0;
+	unsigned char control_pos = 0;
 	bool		found_match = false;
 	int32		match_len;
-	int32		match_off;
+	int32		match_offset;
 	int32		good_match;
 	int32		good_drop;
 	int32		result_size;
 	int32		result_max;
 	int32		need_rate;
 	int			hashsz;
-	int			mask;
+	uint16		mask;

 	/*
 	 * Our fallback strategy is the default.
@@ -560,6 +610,9 @@
 	else if (good_drop > 100)
 		good_drop = 100;

+	/* We use >>7 later to calculate the actual drop, so rescale the percentage to /128 units */
+	good_drop = good_drop * 128 / 100;
+
 	need_rate = strategy->min_comp_rate;
 	if (need_rate < 0)
 		need_rate = 0;
@@ -577,7 +630,9 @@
 		result_max = (slen / 100) * (100 - need_rate);
 	}
 	else
+	{
 		result_max = (slen * (100 - need_rate)) / 100;
+	}

 	/*
 	 * Experiments suggest that these hash sizes work pretty well. A large
@@ -603,10 +658,21 @@
 	 */
 	memset(hist_start, 0, hashsz * sizeof(int16));

+	/*
+	 * Initialize INVALID_ENTRY so that it stops any list walk: its position
+	 * never precedes a real entry's, and its hash never matches.
+	 */
+	hist_entries[INVALID_ENTRY].pos = src_end;
+	hist_entries[INVALID_ENTRY].hist_idx = hashsz;
+
+	/*
+	 * Calculate the initial hash value.
+	 */
+	hist_idx = pglz_hist_idx(src_ptr, mask);
+
 	/*
 	 * Compress the source directly into the output buffer.
 	 */
-	while (dp < dend)
+	while (src_ptr < compress_src_end)
 	{
 		/*
 		 * If we already exceeded the maximum result size, fail.
@@ -615,7 +681,7 @@
 		 * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
 		 * allow 4 slop bytes.
 		 */
-		if (bp - bstart >= result_max)
+		if (dest_ptr - dest_start >= result_max)
 			return -1;

 		/*
@@ -624,27 +690,36 @@
 		 * reasonably quickly when looking at incompressible input (such as
 		 * pre-compressed data).
 		 */
-		if (!found_match && bp - bstart >= strategy->first_success_by)
+		if (!found_match && dest_ptr - dest_start >= strategy->first_success_by)
 			return -1;

+		/*
+		 * Refresh the control byte if needed.
+		 */
+		if ((control_pos & 0xff) == 0)
+		{
+			*(control_ptr) = control_byte;
+			control_ptr = (dest_ptr)++;
+			control_byte = 0;
+			control_pos = 1;
+		}
+
 		/*
 		 * Try to find a match in the history
 		 */
-		if (pglz_find_match(hist_start, dp, dend, &match_len,
-							&match_off, good_match, good_drop, mask))
+		if (pglz_find_match(hist_idx, src_ptr, compress_src_end, &match_len,
+							&match_offset, good_match, good_drop))
 		{
 			/*
 			 * Create the tag and add history entries for all matched
 			 * characters.
 			 */
-			pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
+			control_byte |= control_pos;
+			dest_ptr = pglz_out_tag(dest_ptr, match_len, match_offset);
 			while (match_len--)
 			{
-				pglz_hist_add(hist_start, hist_entries,
-							  hist_next, hist_recycle,
-							  dp, dend, mask);
-				dp++;			/* Do not do this ++ in the line above! */
-				/* The macro would do it four times - Jan.	*/
+				hist_next = pglz_hist_add(hist_next, &hist_idx, src_ptr, mask);
+				src_ptr++;
 			}
 			found_match = true;
 		}
@@ -653,21 +728,47 @@
 			/*
 			 * No match found. Copy one literal byte.
 			 */
-			pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
-			pglz_hist_add(hist_start, hist_entries,
-						  hist_next, hist_recycle,
-						  dp, dend, mask);
-			dp++;				/* Do not do this ++ in the line above! */
-			/* The macro would do it four times - Jan.	*/
+			hist_next = pglz_hist_add(hist_next, &hist_idx, src_ptr, mask);
+			*(dest_ptr)++ = (unsigned char) (*src_ptr);
+			src_ptr++;
 		}
+		control_pos <<= 1;
 	}
+
+
+	while (src_ptr < src_end)
+	{
+		/*
+		 * If we already exceeded the maximum result size, fail.
+		 *
+		 * We check once per loop; this tail loop emits at most 2 bytes per
+		 * iteration (a control byte and a literal).
+		 */
+		if (dest_ptr - dest_start >= result_max)
+			return -1;
+
+		/*
+		 * Refresh the control byte if needed.
+		 */
+		if ((control_pos & 0xff) == 0)
+		{
+			*(control_ptr) = control_byte;
+			control_ptr = (dest_ptr)++;
+			control_byte = 0;
+			control_pos = 1;
+		}
+		*(dest_ptr)++ = (unsigned char) (*src_ptr);
+		src_ptr++;
+		control_pos <<= 1;
+	}

 	/*
 	 * Write out the last control byte and check that we haven't overrun the
 	 * output size allowed by the strategy.
 	 */
-	*ctrlp = ctrlb;
-	result_size = bp - bstart;
+	*control_ptr = control_byte;
+	result_size = dest_ptr - dest_start;
 	if (result_size >= result_max)
 		return -1;
-- 
2.17.0
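
A few standalone sketches for reviewers, since most of the interesting behavior hides in three small helpers.

First, the portable pglz_read32() path. It relies on the compiler folding memcpy() into a single load, which every optimizing compiler I have checked does on x86 and modern ARM; PGLZ_FORCE_MEMORY_ACCESS=1 only matters where that folding fails. A minimal sketch; read32() is a local stand-in using uint32_t from <stdint.h> instead of the backend's uint32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the patch's portable pglz_read32(). */
static uint32_t
read32(const void *ptr)
{
	uint32_t	val;

	memcpy(&val, ptr, sizeof(val));
	return val;
}

int
main(void)
{
	unsigned char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};

	/* buf + 1 is misaligned for uint32_t, yet this is well-defined. */
	printf("%08x %08x\n", read32(buf), read32(buf + 1));
	return 0;
}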
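
Second, the incremental hash in pglz_hist_add() is only valid if it agrees with pglz_hist_idx() at every input position. This self-contained test checks that identity over a random buffer; hash4() and roll() are local copies of the two expressions, and the mask mirrors PGLZ_MAX_HISTORY_LISTS - 1:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Full 4-byte hash, as in the patched pglz_hist_idx(). */
static uint16_t
hash4(const unsigned char *s, uint16_t mask)
{
	return ((s[0] << 6) ^ (s[1] << 4) ^ (s[2] << 2) ^ s[3]) & mask;
}

/* Rolling update from pglz_hist_add(): drop s[0], add s[4]. */
static uint16_t
roll(uint16_t h, const unsigned char *s, uint16_t mask)
{
	return (((h ^ (s[0] << 6)) << 2) ^ s[4]) & mask;
}

int
main(void)
{
	unsigned char buf[1024];
	uint16_t	mask = 8192 - 1;
	uint16_t	h;
	int			i;

	for (i = 0; i < (int) sizeof(buf); i++)
		buf[i] = (unsigned char) rand();

	h = hash4(buf, mask);
	for (i = 0; i + 5 <= (int) sizeof(buf); i++)
	{
		if (h != hash4(buf + i, mask))
		{
			printf("mismatch at position %d\n", i);
			return 1;
		}
		h = roll(h, buf + i, mask);
	}
	printf("rolling hash matches full recomputation\n");
	return 0;
}

As far as I can tell the identity holds for any power-of-two mask, because the final & mask discards exactly the bits where the unmasked s[0] contribution could leak through.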
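
Third, the on-disk tag format is unchanged by the patch, so pglz_out_tag() must keep emitting exactly what pglz_decompress() expects: the low nibble of the first byte is the length code (0x0f selects the 3-byte form with an extra length byte), and the high nibble carries bits 8-11 of the offset. A round-trip sketch; out_tag() is a copy of the patched function, and in_tag() is a hypothetical decoder written just for this note (the real one lives in pglz_decompress()):

#include <stdint.h>
#include <stdio.h>

static unsigned char *
out_tag(unsigned char *dst, int32_t len, int32_t off)
{
	if (len > 17)
	{
		*(dst++) = (unsigned char) (((off & 0xf00) >> 4) | 0x0f);
		*(dst++) = (unsigned char) (off & 0xff);
		*(dst++) = (unsigned char) (len - 18);
	}
	else
	{
		*(dst++) = (unsigned char) (((off & 0xf00) >> 4) | (len - 3));
		*(dst++) = (unsigned char) (off & 0xff);
	}
	return dst;
}

/* Hypothetical decoder for this note only. */
static const unsigned char *
in_tag(const unsigned char *src, int32_t *len, int32_t *off)
{
	int32_t		lcode = src[0] & 0x0f;

	*off = ((src[0] & 0xf0) << 4) | src[1];
	if (lcode == 0x0f)
	{
		*len = src[2] + 18;
		return src + 3;
	}
	*len = lcode + 3;
	return src + 2;
}

int
main(void)
{
	unsigned char buf[4];
	int32_t		len,
				off;

	out_tag(buf, 273, 4093);	/* PGLZ_MAX_MATCH and the largest offset */
	in_tag(buf, &len, &off);
	printf("len=%d off=%d\n", len, off);	/* expect len=273 off=4093 */
	return 0;
}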
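
The 4-byte comparison loop in pglz_compare() is where most of the x1.4 likely comes from: one 32-bit compare replaces four byte compares on the hot path, and the byte loop only finishes the tail. A standalone version; compare() and read32() are local stand-ins for the patched functions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
read32(const void *ptr)
{
	uint32_t	val;

	memcpy(&val, ptr, sizeof(val));
	return val;
}

/* Same shape as the patched pglz_compare(). */
static int32_t
compare(int32_t len, int32_t len_bound,
		const unsigned char *a, const unsigned char *b)
{
	/* Word-at-a-time fast path. */
	while (len <= len_bound - 4 && read32(a) == read32(b))
	{
		len += 4;
		a += 4;
		b += 4;
	}
	/* Byte-at-a-time tail. */
	while (len < len_bound && *a == *b)
	{
		len++;
		a++;
		b++;
	}
	return len;
}

int
main(void)
{
	const unsigned char x[] = "abcdefghij";
	const unsigned char y[] = "abcdefgHij";

	printf("%d\n", compare(0, 10, x, y));	/* first mismatch: 7 */
	return 0;
}

The patch still clamps the caller's result with len = Min(len, len_bound) in pglz_find_match() as a safety net.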
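
Finally, the good_drop rescaling. Replacing the per-entry division by 100 with a shift by 7 changes the semantics slightly: good_drop is first rescaled into /128 units, so a strategy value of 10 percent becomes 12/128 = 9.375 percent, which is presumably why the header comment now says "roughly 10%". A toy comparison of the two formulas:

#include <stdio.h>

int
main(void)
{
	int			good_drop = 10; /* percent, from the strategy */
	int			scaled = good_drop * 128 / 100; /* 12 */
	int			good_match;

	for (good_match = 1024; good_match >= 64; good_match /= 2)
		printf("good_match %4d: /100 drops %3d, >>7 drops %3d\n",
			   good_match,
			   good_match * good_drop / 100,
			   (good_match * scaled) >> 7);
	return 0;
}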