This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new abb91cf9 perf(parquet): eliminate per-value allocation in delta 
bit-pack decoder (#730)
abb91cf9 is described below

commit abb91cf9aedde397a21f810680eb57627af225b3
Author: Matt Topol <[email protected]>
AuthorDate: Sun Mar 29 13:54:35 2026 -0400

    perf(parquet): eliminate per-value allocation in delta bit-pack decoder 
(#730)
    
    ### Rationale for this change
    
    The delta bit-pack decoder's `unpackNextMini()` method calls
    `BitReader.GetValue()` once per value in each miniblock. `GetValue()`
    allocates a fresh `[]uint64` slice on every call.
    
    For the default block size of 128 with 4 miniblocks of 32 values each,
    this causes ~128 heap allocations per block, or ~2048 allocations per
    1024-value page. This allocation pressure dominates the decoder's
    runtime and generates significant GC load.
    
    ### What changes are included in this PR?
    
    1. **Reused buffer**: Added a `deltaBuf []uint64` field to the decoder
    struct that is allocated once and reused across calls, eliminating the
    per-value allocations.
    2. **Width=0 fast-path**: When `deltaBitWidth == 0` (all deltas are
    identical, common for sequential or constant data), skip the bit reading
    entirely and directly accumulate `minDelta`
    
    ### Are these changes tested?
    Yes, the existing test suite passes along with all encoding and property
    tests
    
    ### Are there any user-facing changes?
    No user-facing API changes, purely an internal optimization
    
    *Benchmark Results (darwin/arm64, Apple M4, Go 1.25)*
    
    **Baseline (main):**
    ```
    goos: darwin
    goarch: arm64
    pkg: github.com/apache/arrow-go/v18/parquet/internal/encoding
    cpu: Apple M4
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     101904   11393 
ns/op    359.52 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     107488   11192 
ns/op    365.97 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     107224   11338 
ns/op    361.26 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     105836   11338 
ns/op    361.26 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     105051   11167 
ns/op    366.78 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      1846  653185 
ns/op    401.33 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      1914  631663 
ns/op    415.01 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      1912  630612 
ns/op    415.70 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      1916  633029 
ns/op    414.11 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      1939  640181 
ns/op    409.48 MB/s   4912 B/op   2 allocs/op
    ```
    
    **Optimized (this PR):**
    ```
    goos: darwin
    goarch: arm64
    pkg: github.com/apache/arrow-go/v18/parquet/internal/encoding
    cpu: Apple M4
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     435112    2768 
ns/op   1479.84 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     434569    2768 
ns/op   1479.95 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     434536    2772 
ns/op   1477.51 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     411572    2859 
ns/op   1432.55 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_1024-10     419126    2862 
ns/op   1431.03 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      8756  136708 
ns/op   1917.55 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      8850  136880 
ns/op   1915.14 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      8913  136911 
ns/op   1914.70 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      8158  145797 
ns/op   1798.01 MB/s   4912 B/op   2 allocs/op
    BenchmarkDeltaBinaryPackedDecodingInt32/len_65536-10      8281  142564 
ns/op   1838.79 MB/s   4912 B/op   2 allocs/op
    ```
    
    **Summary:**
    
    | Size | Baseline (ns/op) | Optimized (ns/op) | Speedup | Baseline
    (MB/s) | Optimized (MB/s) |
    
    
|------|-------------------|-------------------|---------|-----------------|------------------|
    | 1024 | 11,286 | 2,806 | **4.0x** | 363 | 1,460 |
    | 65536 | 637,734 | 139,772 | **4.6x** | 411 | 1,877 |
    
    * ~4-4.6x faster decoding
    * Zero additional allocations (2 allocs/op unchanged — those are from
    test setup)
    * Encoding performance is unchanged (encoder path not modified)
    
    Co-authored-by: Matt Topol <[email protected]>
---
 parquet/internal/encoding/delta_bit_packing.go | 32 +++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/parquet/internal/encoding/delta_bit_packing.go 
b/parquet/internal/encoding/delta_bit_packing.go
index 9f43fc88..b74c27d0 100644
--- a/parquet/internal/encoding/delta_bit_packing.go
+++ b/parquet/internal/encoding/delta_bit_packing.go
@@ -54,6 +54,7 @@ type deltaBitPackDecoder[T int32 | int64] struct {
        lastVal     int64
 
        miniBlockValues []T
+       deltaBuf        []uint64
 }
 
 // returns the number of bytes read so far
@@ -138,14 +139,33 @@ func (d *deltaBitPackDecoder[T]) unpackNextMini() error {
        d.deltaBitWidth = d.deltaBitWidths.Bytes()[int(d.miniBlockIdx)]
        d.currentMiniBlockVals = d.valsPerMini
 
-       for j := 0; j < int(d.valsPerMini); j++ {
-               delta, ok := d.bitdecoder.GetValue(int(d.deltaBitWidth))
-               if !ok {
-                       return errors.New("parquet: eof exception")
+       n := int(d.valsPerMini)
+       width := uint(d.deltaBitWidth)
+
+       if width == 0 {
+               for j := 0; j < n; j++ {
+                       d.lastVal += int64(d.minDelta)
+                       d.miniBlockValues = append(d.miniBlockValues, 
T(d.lastVal))
+               }
+       } else {
+               if cap(d.deltaBuf) < 1 {
+                       d.deltaBuf = make([]uint64, 1)
                }
+               d.deltaBuf = d.deltaBuf[:1]
+               minDelta := d.minDelta
+
+               for j := 0; j < n; j++ {
+                       nread, err := d.bitdecoder.GetBatch(width, d.deltaBuf)
+                       if err != nil {
+                               return err
+                       }
+                       if nread != 1 {
+                               return errors.New("parquet: eof exception")
+                       }
 
-               d.lastVal += int64(delta) + int64(d.minDelta)
-               d.miniBlockValues = append(d.miniBlockValues, T(d.lastVal))
+                       d.lastVal += int64(d.deltaBuf[0]) + minDelta
+                       d.miniBlockValues = append(d.miniBlockValues, 
T(d.lastVal))
+               }
        }
        d.miniBlockIdx++
        return nil

Reply via email to