------- Original Message -------
On Friday, March 24th, 2023 at 10:30 AM, gkokola...@pm.me <gkokola...@pm.me>
wrote:
>
> ------- Original Message -------
> On Thursday, March 23rd, 2023 at 6:10 PM, Tomas Vondra
> tomas.von...@enterprisedb.com wrote:
>
> > This leaves the empty-data issue (which we have a fix for) and the
> > switch to LZ4F. And then the zstd part.
>
> Please expect promptly a patch for the switch to frames.
Please find the expected patch attached. Note that the bulk of the
patch is code unification, variable renaming to something more
appropriate, and comment addition. These are changes that are not
strictly necessary to switch to LZ4F. I do believe that they are
essential for code hygiene after the switch and that they belong
in the same commit.
Cheers,
//Georgios
>
> Cheers,
> //Georgios
From c289fb8d49b680ad180a44b20fff1dc9553b7494 Mon Sep 17 00:00:00 2001
From: Georgios Kokolatos <gkokola...@pm.me>
Date: Tue, 28 Mar 2023 15:48:06 +0000
Subject: [PATCH v1] Use LZ4 frames in pg_dump's compressor API.
This change allows for greater compaction of data, especially in very narrow
relations, by avoiding at least a compaction header and footer per row. Since
LZ4 frames are now used by both compression APIs, some code deduplication
opportunities have become obvious and are also implemented.
Reported by: Justin Pryzby
---
src/bin/pg_dump/compress_lz4.c | 358 ++++++++++++++++++++++-----------
1 file changed, 244 insertions(+), 114 deletions(-)
diff --git a/src/bin/pg_dump/compress_lz4.c b/src/bin/pg_dump/compress_lz4.c
index fc2f4e116d..078dc35cd6 100644
--- a/src/bin/pg_dump/compress_lz4.c
+++ b/src/bin/pg_dump/compress_lz4.c
@@ -17,7 +17,6 @@
#include "compress_lz4.h"
#ifdef USE_LZ4
-#include <lz4.h>
#include <lz4frame.h>
/*
@@ -29,102 +28,279 @@
#endif
/*----------------------
- * Compressor API
- *----------------------
+ * Common to both APIs
*/
-typedef struct LZ4CompressorState
+/*
+ * State used for LZ4 (de)compression by both APIs.
+ */
+typedef struct LZ4State
{
- char *outbuf;
- size_t outsize;
-} LZ4CompressorState;
+ /*
+ * Used by the File API to keep track of the file stream.
+ */
+ FILE *fp;
+
+ LZ4F_preferences_t prefs;
+
+ LZ4F_compressionContext_t ctx;
+ LZ4F_decompressionContext_t dtx;
+
+ /*
+ * Used by the File API's lazy initialization.
+ */
+ bool inited;
+
+ /*
+ * Used by the File API to distinguish between compression
+ * and decompression operations.
+ */
+ bool compressing;
+
+ /*
+ * Used by the Compressor API to mark if the compression
+ * headers have been written after initialization.
+ */
+ bool needs_header_flush;
+
+ size_t buflen;
+ char *buffer;
+
+ /*
+ * Used by the File API to store already uncompressed
+ * data that the caller has not consumed.
+ */
+ size_t overflowalloclen;
+ size_t overflowlen;
+ char *overflowbuf;
+
+ /*
+ * Used by both APIs to keep track of the compressed
+ * data length stored in the buffer.
+ */
+ size_t compressedlen;
+
+ /*
+ * Used by both APIs to keep track of error codes.
+ */
+ size_t errcode;
+} LZ4State;
+
+/*
+ * Initialize the required LZ4State members for compression. Write the LZ4 frame
+ * header in a buffer keeping track of its length. Users of this function can
+ * choose when and how to write the header to a file stream.
+ *
+ * Returns true on success. In case of a failure returns false, and stores the
+ * error code in state->errcode.
+ */
+static bool
+LZ4_compression_state_init(LZ4State *state)
+{
+ size_t status;
+
+ state->buflen = LZ4F_compressBound(DEFAULT_IO_BUFFER_SIZE, &state->prefs);
+
+ /*
+ * LZ4F_compressBegin requires a buffer that is greater or equal to
+ * LZ4F_HEADER_SIZE_MAX. Verify that the requirement is met.
+ */
+ if (state->buflen < LZ4F_HEADER_SIZE_MAX)
+ state->buflen = LZ4F_HEADER_SIZE_MAX;
+
+ status = LZ4F_createCompressionContext(&state->ctx, LZ4F_VERSION);
+ if (LZ4F_isError(status))
+ {
+ state->errcode = status;
+ return false;
+ }
+
+ state->buffer = pg_malloc(state->buflen);
+ status = LZ4F_compressBegin(state->ctx,
+ state->buffer, state->buflen,
+ &state->prefs);
+ if (LZ4F_isError(status))
+ {
+ state->errcode = status;
+ return false;
+ }
+
+ state->compressedlen = status;
+
+ return true;
+}
+
+/*----------------------
+ * Compressor API
+ *----------------------
+ */
/* Private routines that support LZ4 compressed data I/O */
-static void ReadDataFromArchiveLZ4(ArchiveHandle *AH, CompressorState *cs);
-static void WriteDataToArchiveLZ4(ArchiveHandle *AH, CompressorState *cs,
- const void *data, size_t dLen);
-static void EndCompressorLZ4(ArchiveHandle *AH, CompressorState *cs);
static void
ReadDataFromArchiveLZ4(ArchiveHandle *AH, CompressorState *cs)
{
- LZ4_streamDecode_t lz4StreamDecode;
- char *buf;
- char *decbuf;
- size_t buflen;
- size_t cnt;
-
- buflen = DEFAULT_IO_BUFFER_SIZE;
- buf = pg_malloc(buflen);
- decbuf = pg_malloc(buflen);
+ size_t r;
+ size_t readbuflen;
+ char *outbuf;
+ char *readbuf;
+ LZ4F_decompressionContext_t ctx = NULL;
+ LZ4F_decompressOptions_t dec_opt;
+ LZ4F_errorCode_t status;
+
+ memset(&dec_opt, 0, sizeof(dec_opt));
+ status = LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION);
+ if (LZ4F_isError(status))
+ pg_fatal("could not create LZ4 decompression context: %s",
+ LZ4F_getErrorName(status));
+
+ outbuf = pg_malloc0(DEFAULT_IO_BUFFER_SIZE);
+ readbuf = pg_malloc0(DEFAULT_IO_BUFFER_SIZE);
+ readbuflen = DEFAULT_IO_BUFFER_SIZE;
+ while ((r = cs->readF(AH, &readbuf, &readbuflen)) > 0)
+ {
+ char *readp;
+ char *readend;
- LZ4_setStreamDecode(&lz4StreamDecode, NULL, 0);
+ /* Process one chunk */
+ readp = readbuf;
+ readend = readbuf + r;
+ while (readp < readend)
+ {
+ size_t out_size = DEFAULT_IO_BUFFER_SIZE;
+ size_t read_size = readend - readp;
- while ((cnt = cs->readF(AH, &buf, &buflen)))
- {
- int decBytes = LZ4_decompress_safe_continue(&lz4StreamDecode,
- buf, decbuf,
- cnt, buflen);
+ memset(outbuf, 0, DEFAULT_IO_BUFFER_SIZE);
+ status = LZ4F_decompress(ctx, outbuf, &out_size,
+ readp, &read_size, &dec_opt);
+ if (LZ4F_isError(status))
+ pg_fatal("could not decompress: %s",
+ LZ4F_getErrorName(status));
- ahwrite(decbuf, 1, decBytes, AH);
+ ahwrite(outbuf, 1, out_size, AH);
+ readp += read_size;
+ }
}
- pg_free(buf);
- pg_free(decbuf);
+ pg_free(outbuf);
+ pg_free(readbuf);
+
+ status = LZ4F_freeDecompressionContext(ctx);
+ if (LZ4F_isError(status))
+ pg_fatal("could not free LZ4 decompression context: %s",
+ LZ4F_getErrorName(status));
}
static void
WriteDataToArchiveLZ4(ArchiveHandle *AH, CompressorState *cs,
const void *data, size_t dLen)
{
- LZ4CompressorState *LZ4cs = (LZ4CompressorState *) cs->private_data;
- size_t compressed;
- size_t requiredsize = LZ4_compressBound(dLen);
+ LZ4State *state = (LZ4State *) cs->private_data;
+ size_t remaining = dLen;
+ size_t status;
+ size_t chunk;
- if (requiredsize > LZ4cs->outsize)
+ /* Write the header if not yet written. */
+ if (state->needs_header_flush)
{
- LZ4cs->outbuf = pg_realloc(LZ4cs->outbuf, requiredsize);
- LZ4cs->outsize = requiredsize;
+ cs->writeF(AH, state->buffer, state->compressedlen);
+ state->needs_header_flush = false;
}
- compressed = LZ4_compress_default(data, LZ4cs->outbuf,
- dLen, LZ4cs->outsize);
+ while (remaining > 0)
+ {
+
+ if (remaining > DEFAULT_IO_BUFFER_SIZE)
+ chunk = DEFAULT_IO_BUFFER_SIZE;
+ else
+ chunk = remaining;
+
+ remaining -= chunk;
+ status = LZ4F_compressUpdate(state->ctx,
+ state->buffer, state->buflen,
+ data, chunk, NULL);
+
+ if (LZ4F_isError(status))
+ pg_fatal("failed to LZ4 compress data: %s",
+ LZ4F_getErrorName(status));
- if (compressed <= 0)
- pg_fatal("failed to LZ4 compress data");
+ cs->writeF(AH, state->buffer, status);
- cs->writeF(AH, LZ4cs->outbuf, compressed);
+ data = ((char *) data) + chunk;
+ }
}
static void
EndCompressorLZ4(ArchiveHandle *AH, CompressorState *cs)
{
- LZ4CompressorState *LZ4cs;
-
- LZ4cs = (LZ4CompressorState *) cs->private_data;
- if (LZ4cs)
- {
- pg_free(LZ4cs->outbuf);
- pg_free(LZ4cs);
- cs->private_data = NULL;
- }
+ LZ4State *state = (LZ4State *) cs->private_data;
+ size_t status;
+
+ /* Nothing needs to be done */
+ if (!state)
+ return;
+
+ /*
+ * Write the header if not yet written. The caller is not required to
+ * call writeData if the relation does not contain any data. Thus it is
+ * possible to reach here without having flushed the header. Do it before
+ * ending the compression.
+ */
+ if (state->needs_header_flush)
+ cs->writeF(AH, state->buffer, state->compressedlen);
+
+ status = LZ4F_compressEnd(state->ctx,
+ state->buffer, state->buflen,
+ NULL);
+ if (LZ4F_isError(status))
+ pg_fatal("failed to end compression: %s",
+ LZ4F_getErrorName(status));
+
+ cs->writeF(AH, state->buffer, status);
+
+ status = LZ4F_freeCompressionContext(state->ctx);
+ if (LZ4F_isError(status))
+ pg_fatal("failed to end compression: %s",
+ LZ4F_getErrorName(status));
+
+ pg_free(state->buffer);
+ pg_free(state);
+
+ cs->private_data = NULL;
}
-
/*
* Public routines that support LZ4 compressed data I/O
*/
void
InitCompressorLZ4(CompressorState *cs, const pg_compress_specification compression_spec)
{
+ LZ4State *state;
+
cs->readData = ReadDataFromArchiveLZ4;
cs->writeData = WriteDataToArchiveLZ4;
cs->end = EndCompressorLZ4;
cs->compression_spec = compression_spec;
- /* Will be lazy init'd */
- cs->private_data = pg_malloc0(sizeof(LZ4CompressorState));
+ /*
+ * Read operations have access to the whole input. No state needs
+ * to be carried between calls.
+ */
+ if (cs->readF)
+ return;
+
+ state = pg_malloc0(sizeof(*state));
+ if (cs->compression_spec.level >= 0)
+ state->prefs.compressionLevel = cs->compression_spec.level;
+
+ if (!LZ4_compression_state_init(state))
+ pg_fatal("could not initialize LZ4 compression: %s",
+ LZ4F_getErrorName(state->errcode));
+
+ /* Remember that the header has not been written. */
+ state->needs_header_flush = true;
+ cs->private_data = state;
}
/*----------------------
@@ -132,30 +308,6 @@ InitCompressorLZ4(CompressorState *cs, const pg_compress_specification compressi
*----------------------
*/
-/*
- * State needed for LZ4 (de)compression using the CompressFileHandle API.
- */
-typedef struct LZ4File
-{
- FILE *fp;
-
- LZ4F_preferences_t prefs;
-
- LZ4F_compressionContext_t ctx;
- LZ4F_decompressionContext_t dtx;
-
- bool inited;
- bool compressing;
-
- size_t buflen;
- char *buffer;
-
- size_t overflowalloclen;
- size_t overflowlen;
- char *overflowbuf;
-
- size_t errcode;
-} LZ4File;
/*
* LZ4 equivalent to feof() or gzeof(). Return true iff there is no
@@ -165,7 +317,7 @@ typedef struct LZ4File
static bool
LZ4File_eof(CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
return fs->overflowlen == 0 && feof(fs->fp);
}
@@ -173,7 +325,7 @@ LZ4File_eof(CompressFileHandle *CFH)
static const char *
LZ4File_get_error(CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
const char *errmsg;
if (LZ4F_isError(fs->errcode))
@@ -185,7 +337,7 @@ LZ4File_get_error(CompressFileHandle *CFH)
}
/*
- * Prepare an already alloc'ed LZ4File struct for subsequent calls (either
+ * Prepare an already alloc'ed LZ4State struct for subsequent calls (either
* compression or decompression).
*
* It creates the necessary contexts for the operations. When compressing data
@@ -196,7 +348,7 @@ LZ4File_get_error(CompressFileHandle *CFH)
* error code in fs->errcode.
*/
static bool
-LZ4File_init(LZ4File *fs, int size, bool compressing)
+LZ4File_init(LZ4State *fs, int size, bool compressing)
{
size_t status;
@@ -209,33 +361,11 @@ LZ4File_init(LZ4File *fs, int size, bool compressing)
/* When compressing, write LZ4 header to the output stream. */
if (fs->compressing)
{
- fs->buflen = LZ4F_compressBound(DEFAULT_IO_BUFFER_SIZE, &fs->prefs);
- /*
- * LZ4F_compressBegin requires a buffer that is greater or equal to
- * LZ4F_HEADER_SIZE_MAX. Verify that the requirement is met.
- */
- if (fs->buflen < LZ4F_HEADER_SIZE_MAX)
- fs->buflen = LZ4F_HEADER_SIZE_MAX;
-
- status = LZ4F_createCompressionContext(&fs->ctx, LZ4F_VERSION);
- if (LZ4F_isError(status))
- {
- fs->errcode = status;
- return false;
- }
-
- fs->buffer = pg_malloc(fs->buflen);
- status = LZ4F_compressBegin(fs->ctx, fs->buffer, fs->buflen,
- &fs->prefs);
-
- if (LZ4F_isError(status))
- {
- fs->errcode = status;
+ if (!LZ4_compression_state_init(fs))
return false;
- }
- if (fwrite(fs->buffer, 1, status, fs->fp) != status)
+ if (fwrite(fs->buffer, 1, fs->compressedlen, fs->fp) != fs->compressedlen)
{
errno = (errno) ? errno : ENOSPC;
return false;
@@ -272,7 +402,7 @@ LZ4File_init(LZ4File *fs, int size, bool compressing)
* the 'ptr' buffer), or 0 if the overflow buffer is empty.
*/
static int
-LZ4File_read_overflow(LZ4File *fs, void *ptr, int size, bool eol_flag)
+LZ4File_read_overflow(LZ4State *fs, void *ptr, int size, bool eol_flag)
{
char *p;
int readlen = 0;
@@ -306,7 +436,7 @@ LZ4File_read_overflow(LZ4File *fs, void *ptr, int size, bool eol_flag)
* char if found first when the eol_flag is set. It is possible that the
* decompressed output generated by reading any compressed input via the
* LZ4F API, exceeds 'ptrsize'. Any exceeding decompressed content is stored
- * at an overflow buffer within LZ4File. Of course, when the function is
+ * at an overflow buffer within LZ4State. Of course, when the function is
* called, it will first try to consume any decompressed content already
* present in the overflow buffer, before decompressing new content.
*
@@ -314,7 +444,7 @@ LZ4File_read_overflow(LZ4File *fs, void *ptr, int size, bool eol_flag)
* buffer, or -1 in case of error.
*/
static int
-LZ4File_read_internal(LZ4File *fs, void *ptr, int ptrsize, bool eol_flag)
+LZ4File_read_internal(LZ4State *fs, void *ptr, int ptrsize, bool eol_flag)
{
int dsize = 0;
int rsize;
@@ -425,7 +555,7 @@ LZ4File_read_internal(LZ4File *fs, void *ptr, int ptrsize, bool eol_flag)
static bool
LZ4File_write(const void *ptr, size_t size, CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
size_t status;
int remaining = size;
@@ -463,7 +593,7 @@ LZ4File_write(const void *ptr, size_t size, CompressFileHandle *CFH)
static bool
LZ4File_read(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
int ret;
if ((ret = LZ4File_read_internal(fs, ptr, size, false)) < 0)
@@ -481,7 +611,7 @@ LZ4File_read(void *ptr, size_t size, size_t *rsize, CompressFileHandle *CFH)
static int
LZ4File_getc(CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
unsigned char c;
if (LZ4File_read_internal(fs, &c, 1, false) <= 0)
@@ -501,7 +631,7 @@ LZ4File_getc(CompressFileHandle *CFH)
static char *
LZ4File_gets(char *ptr, int size, CompressFileHandle *CFH)
{
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
int ret;
ret = LZ4File_read_internal(fs, ptr, size, true);
@@ -523,7 +653,7 @@ static bool
LZ4File_close(CompressFileHandle *CFH)
{
FILE *fp;
- LZ4File *fs = (LZ4File *) CFH->private_data;
+ LZ4State *fs = (LZ4State *) CFH->private_data;
size_t status;
fp = fs->fp;
@@ -568,7 +698,7 @@ LZ4File_open(const char *path, int fd, const char *mode,
CompressFileHandle *CFH)
{
FILE *fp;
- LZ4File *lz4fp = (LZ4File *) CFH->private_data;
+ LZ4State *lz4fp = (LZ4State *) CFH->private_data;
if (fd >= 0)
fp = fdopen(fd, mode);
@@ -609,7 +739,7 @@ void
InitCompressFileHandleLZ4(CompressFileHandle *CFH,
const pg_compress_specification compression_spec)
{
- LZ4File *lz4fp;
+ LZ4State *lz4fp;
CFH->open_func = LZ4File_open;
CFH->open_write_func = LZ4File_open_write;
--
2.34.1