This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 24e89846 perf(parquet/internal/encoding): vectorize amd64 bool unpack (#735)
24e89846 is described below

commit 24e89846346b7f667db94ecba41caece4e49e26b
Author: Matt Topol <[email protected]>
AuthorDate: Sun Mar 29 13:54:21 2026 -0400

    perf(parquet/internal/encoding): vectorize amd64 bool unpack (#735)
    
    ### Rationale for this change
    The SSE4 and AVX2 implementations of _bytes_to_bools in
    parquet/internal/utils/ contain zero SIMD instructions. They completely
    failed to auto-vectorize the C loop, producing purely scalar code
    (movzx/shr/and/mov one bit at a time). The SSE4 and AVX2 .s files are
    byte-for-byte identical — just scalar code with different labels.
    
    This is the amd64 counterpart to #731 which fixed the same issue on
    ARM64 NEON.
    
    ### What changes are included in this PR?
    
    Rewrote both assembly implementations with actual SIMD vectorized code.
    SSE4 (unpack_bool_sse4_amd64.s) — processes 2 input bytes → 16 output
    bools per iteration:
    
    1. MOVWLZX + MOVD — load 2 input bytes into XMM
    2. PSHUFB — broadcast byte 0 → lanes 0-7, byte 1 → lanes 8-15
    3. PAND + PCMPEQB — parallel bit-test against mask
    [1,2,4,8,16,32,64,128] × 2
    4. PAND — normalize 0xFF → 0x01 for valid Go bool values
    5. MOVOU — store 16 output bools at once
    
    AVX2 (unpack_bool_avx2_amd64.s) — processes 4 input bytes → 32 output
    bools per iteration:
    
    1. MOVL + MOVD + VPBROADCASTD — load and broadcast 4 bytes across all 32
    YMM lanes
    2. VPSHUFB — distribute each byte to its 8 corresponding lanes
    3. VPAND + VPCMPEQB + VPAND — parallel bit-test + normalize to 0/1
    4. VMOVDQU — store 32 output bools at once
    5. VZEROUPPER — avoid SSE-AVX transition penalties on return
    
    Both include scalar tails for when fewer than vector-width output slots
    remain.
    
    ### Are these changes tested?
    
    All existing tests continue to pass, new tests added to further
    validate:
    
    - TestBytesToBoolsCorrectness — validates every bit position against the
    reference Go implementation for sizes 1–1024 bytes
    - TestBytesToBoolsOutlenSmaller — edge case where output is smaller than
    8× input
    - BenchmarkBytesToBools — parametric benchmark at 64B, 256B, 1KB, 4KB,
    16KB
    
    
    ### Are there any user-facing changes?
    No, this is purely a performance optimization:
    
    *Benchmark Results (AMD Ryzen 7 7800X3D, linux/amd64)*
    
    ```
                                   baseline (scalar)   optimized (AVX2)
                                       sec/op              sec/op       vs base
    BytesToBools/bytes=64-16           146.0n              15.60n     -89.32% (p=0.008)
    BytesToBools/bytes=256-16          562.3n              63.36n     -88.73% (p=0.008)
    BytesToBools/bytes=1K-16           2.247µ              253.9n     -88.70% (p=0.008)
    BytesToBools/bytes=4K-16           8.970µ              1.018µ     -88.65% (p=0.008)
    BytesToBools/bytes=16K-16         35.798µ              4.044µ     -88.70% (p=0.008)
    geomean                            2.262µ              252.8n     -88.82%
    ```
    
    Throughput: 432 MiB/s → 3,853 MiB/s (+795%)
    Zero allocations in both versions. All results statistically
    significant.
---
 parquet/internal/utils/unpack_bool.go              |   6 +-
 parquet/internal/utils/unpack_bool_avx2_amd64.s    | 201 ++++++++++++---------
 .../internal/utils/unpack_bool_benchmark_test.go   | 105 +++++++++++
 parquet/internal/utils/unpack_bool_sse4_amd64.s    | 190 ++++++++++---------
 4 files changed, 339 insertions(+), 163 deletions(-)

diff --git a/parquet/internal/utils/unpack_bool.go b/parquet/internal/utils/unpack_bool.go
index 3ccb0b7b..07e61a4e 100644
--- a/parquet/internal/utils/unpack_bool.go
+++ b/parquet/internal/utils/unpack_bool.go
@@ -20,7 +20,11 @@ package utils
 func bytesToBoolsGo(in []byte, out []bool) {
        for i, b := range in {
                for j := 0; j < 8; j++ {
-                       out[8*i+j] = (b & (1 << j)) != 0
+                       idx := 8*i + j
+                       if idx >= len(out) {
+                               return
+                       }
+                       out[idx] = (b & (1 << j)) != 0
                }
        }
 }
diff --git a/parquet/internal/utils/unpack_bool_avx2_amd64.s b/parquet/internal/utils/unpack_bool_avx2_amd64.s
index 459ff786..6079fd11 100644
--- a/parquet/internal/utils/unpack_bool_avx2_amd64.s
+++ b/parquet/internal/utils/unpack_bool_avx2_amd64.s
@@ -1,88 +1,127 @@
 //+build !noasm !appengine
-// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
 
-TEXT ·_bytes_to_bools_avx2(SB), $0-32
+// AVX2 vectorized bytes-to-bools using VPBROADCASTD + VPSHUFB + VPCMPEQB.
+// Processes 4 input bytes → 32 output bools per vector iteration.
+// Replaces the original c2goasm-generated scalar code which used zero SIMD.
+
+#include "textflag.h"
+
+// VPSHUFB operates on two independent 128-bit lanes.
+// Lower lane: byte 0 → lanes 0-7, byte 1 → lanes 8-15
+// Upper lane: byte 2 → lanes 16-23, byte 3 → lanes 24-31
+DATA shuffle_avx2<>+0x00(SB)/8, $0x0000000000000000
+DATA shuffle_avx2<>+0x08(SB)/8, $0x0101010101010101
+DATA shuffle_avx2<>+0x10(SB)/8, $0x0202020202020202
+DATA shuffle_avx2<>+0x18(SB)/8, $0x0303030303030303
+GLOBL shuffle_avx2<>(SB), (NOPTR+RODATA), $32
+
+// [1, 2, 4, 8, 16, 32, 64, 128] × 4
+DATA bitmask_avx2<>+0x00(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x08(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x10(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x18(SB)/8, $0x8040201008040201
+GLOBL bitmask_avx2<>(SB), (NOPTR+RODATA), $32
+
+// [1, 1, 1, ...] × 32
+DATA ones_avx2<>+0x00(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x08(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x10(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x18(SB)/8, $0x0101010101010101
+GLOBL ones_avx2<>(SB), (NOPTR+RODATA), $32
+
+TEXT ·_bytes_to_bools_avx2(SB), NOSPLIT, $0-32
 
        MOVQ in+0(FP), DI
        MOVQ len+8(FP), SI
        MOVQ out+16(FP), DX
-       MOVQ outlen+24(FP), CX
-
-       WORD $0xf685             // test    esi, esi
-       JLE  LBB0_5
-       WORD $0x8941; BYTE $0xf0 // mov    r8d, esi
-       LONG $0x03e0c149         // shl    r8, 3
-       WORD $0x3145; BYTE $0xd2 // xor    r10d, r10d
-       JMP  LBB0_2
-
-LBB0_4:
-       LONG $0x08c28349         // add    r10, 8
-       LONG $0x01c78348         // add    rdi, 1
-       WORD $0x394d; BYTE $0xd0 // cmp    r8, r10
-       JE   LBB0_5
-
-LBB0_2:
-       WORD $0x3941; BYTE $0xca // cmp    r10d, ecx
-       JGE  LBB0_4
-       WORD $0x8945; BYTE $0xd1 // mov    r9d, r10d
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0x0124             // and    al, 1
-       LONG $0x0a048842         // mov    byte [rdx + r9], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x01ce8348         // or    rsi, 1
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8d0             // shr    al, 1
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x02ce8348         // or    rsi, 2
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x02 // shr    al, 2
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x03ce8348         // or    rsi, 3
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x03 // shr    al, 3
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x04ce8348         // or    rsi, 4
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x04 // shr    al, 4
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x05ce8348         // or    rsi, 5
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x05 // shr    al, 5
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x06ce8348         // or    rsi, 6
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x06 // shr    al, 6
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       LONG $0x07c98349         // or    r9, 7
-       WORD $0x3941; BYTE $0xc9 // cmp    r9d, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x07 // shr    al, 7
-       LONG $0x0a048842         // mov    byte [rdx + r9], al
-       JMP  LBB0_4
-
-LBB0_5:
+       MOVQ outlen+24(FP), R13
+
+       TESTL SI, SI
+       JLE  done
+
+       VMOVDQU shuffle_avx2<>(SB), Y3
+       VMOVDQU bitmask_avx2<>(SB), Y4
+       VMOVDQU ones_avx2<>(SB), Y5
+
+       XORQ R8, R8
+       XORQ R9, R9
+
+loop32:
+       MOVQ SI, AX
+       SUBQ R8, AX
+       CMPQ AX, $4
+       JL   loop8
+
+       MOVQ R13, AX
+       SUBQ R9, AX
+       CMPQ AX, $32
+       JL   loop8
+
+       MOVL (DI)(R8*1), AX
+       MOVD AX, X0
+       VPBROADCASTD X0, Y0
+       VPSHUFB Y3, Y0, Y0
+       VPAND Y4, Y0, Y1
+       VPCMPEQB Y4, Y1, Y1
+       VPAND Y5, Y1, Y1
+       VMOVDQU Y1, (DX)(R9*1)
+
+       ADDQ $4, R8
+       ADDQ $32, R9
+       JMP  loop32
+
+loop8:
+       CMPQ R8, SI
+       JGE  avx_done
+
+       MOVQ R13, AX
+       SUBQ R9, AX
+       CMPQ AX, $8
+       JL   scalar
+
+       MOVBLZX (DI)(R8*1), AX
+       MOVD AX, X0
+       VPBROADCASTD X0, Y0
+       VPSHUFB Y3, Y0, Y0
+       VPAND Y4, Y0, Y1
+       VPCMPEQB Y4, Y1, Y1
+       VPAND Y5, Y1, Y1
+       MOVQ X1, (DX)(R9*1)
+
+       ADDQ $1, R8
+       ADDQ $8, R9
+       JMP  loop8
+
+scalar:
+       CMPQ R8, SI
+       JGE  avx_done
+       CMPQ R9, R13
+       JGE  avx_done
+
+       MOVBLZX (DI)(R8*1), AX
+       XORQ CX, CX
+
+scalar_bit:
+       CMPQ CX, $8
+       JGE  scalar_next
+       CMPQ R9, R13
+       JGE  avx_done
+
+       MOVL AX, R11
+       SHRL CL, R11
+       ANDL $1, R11
+       MOVB R11, (DX)(R9*1)
+
+       INCQ CX
+       INCQ R9
+       JMP  scalar_bit
+
+scalar_next:
+       INCQ R8
+       JMP  scalar
+
+avx_done:
+       VZEROUPPER
+
+done:
        RET
diff --git a/parquet/internal/utils/unpack_bool_benchmark_test.go b/parquet/internal/utils/unpack_bool_benchmark_test.go
new file mode 100644
index 00000000..e5d18b94
--- /dev/null
+++ b/parquet/internal/utils/unpack_bool_benchmark_test.go
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils_test
+
+import (
+       "fmt"
+       "math/rand"
+       "testing"
+
+       "github.com/apache/arrow-go/v18/parquet/internal/utils"
+)
+
+func BenchmarkBytesToBools(b *testing.B) {
+       for _, nBytes := range []int{64, 256, 1024, 4096, 16384} {
+               in := make([]byte, nBytes)
+               rng := rand.New(rand.NewSource(42))
+               for i := range in {
+                       in[i] = byte(rng.Intn(256))
+               }
+               out := make([]bool, nBytes*8)
+
+               b.Run(fmt.Sprintf("bytes=%d", nBytes), func(b *testing.B) {
+                       b.SetBytes(int64(nBytes))
+                       for i := 0; i < b.N; i++ {
+                               utils.BytesToBools(in, out)
+                       }
+               })
+       }
+}
+
+func TestBytesToBoolsCorrectness(t *testing.T) {
+       rng := rand.New(rand.NewSource(12345))
+
+       for _, nBytes := range []int{1, 2, 3, 7, 8, 15, 16, 31, 32, 63, 64, 100, 256, 1024} {
+               t.Run(fmt.Sprintf("bytes=%d", nBytes), func(t *testing.T) {
+                       in := make([]byte, nBytes)
+                       for i := range in {
+                               in[i] = byte(rng.Intn(256))
+                       }
+
+                       outlen := nBytes * 8
+                       got := make([]bool, outlen)
+                       want := make([]bool, outlen)
+
+                       for i, b := range in {
+                               for j := 0; j < 8; j++ {
+                                       want[8*i+j] = (b & (1 << j)) != 0
+                               }
+                       }
+
+                       utils.BytesToBools(in, got)
+
+                       for i := 0; i < outlen; i++ {
+                               if got[i] != want[i] {
+                                       byteIdx := i / 8
+                                       bitIdx := i % 8
+                                       t.Fatalf("mismatch at index %d (byte %d, bit %d): got %v, want %v (input byte = 0x%02x)",
+                                               i, byteIdx, bitIdx, got[i], want[i], in[byteIdx])
+                               }
+                       }
+               })
+       }
+}
+
+func TestBytesToBoolsOutlenSmaller(t *testing.T) {
+       in := []byte{0xFF, 0xAA, 0x55}
+       for outlen := 1; outlen <= 24; outlen++ {
+               t.Run(fmt.Sprintf("outlen=%d", outlen), func(t *testing.T) {
+                       got := make([]bool, outlen)
+                       want := make([]bool, outlen)
+
+                       for i, b := range in {
+                               for j := 0; j < 8; j++ {
+                                       idx := 8*i + j
+                                       if idx >= outlen {
+                                               break
+                                       }
+                                       want[idx] = (b & (1 << j)) != 0
+                               }
+                       }
+
+                       utils.BytesToBools(in, got)
+
+                       for i := 0; i < outlen; i++ {
+                               if got[i] != want[i] {
+                                       t.Fatalf("outlen=%d: mismatch at index %d: got %v, want %v", outlen, i, got[i], want[i])
+                               }
+                       }
+               })
+       }
+}
diff --git a/parquet/internal/utils/unpack_bool_sse4_amd64.s 
b/parquet/internal/utils/unpack_bool_sse4_amd64.s
index ac8acb9f..a8d9fc88 100644
--- a/parquet/internal/utils/unpack_bool_sse4_amd64.s
+++ b/parquet/internal/utils/unpack_bool_sse4_amd64.s
@@ -1,88 +1,116 @@
 //+build !noasm !appengine
-// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
 
-TEXT ·_bytes_to_bools_sse4(SB), $0-32
+// SSE4 vectorized bytes-to-bools using PSHUFB broadcast + PCMPEQB bit-test.
+// Processes 2 input bytes → 16 output bools per vector iteration.
+// Replaces the original c2goasm-generated scalar code which used zero SIMD.
+
+#include "textflag.h"
+
+// broadcast byte 0 → lanes 0-7, byte 1 → lanes 8-15
+DATA shuffle_sse4<>+0x00(SB)/8, $0x0000000000000000
+DATA shuffle_sse4<>+0x08(SB)/8, $0x0101010101010101
+GLOBL shuffle_sse4<>(SB), (NOPTR+RODATA), $16
+
+// [1, 2, 4, 8, 16, 32, 64, 128] × 2
+DATA bitmask_sse4<>+0x00(SB)/8, $0x8040201008040201
+DATA bitmask_sse4<>+0x08(SB)/8, $0x8040201008040201
+GLOBL bitmask_sse4<>(SB), (NOPTR+RODATA), $16
+
+// [1, 1, 1, ...] × 16
+DATA ones_sse4<>+0x00(SB)/8, $0x0101010101010101
+DATA ones_sse4<>+0x08(SB)/8, $0x0101010101010101
+GLOBL ones_sse4<>(SB), (NOPTR+RODATA), $16
+
+TEXT ·_bytes_to_bools_sse4(SB), NOSPLIT, $0-32
 
        MOVQ in+0(FP), DI
        MOVQ len+8(FP), SI
        MOVQ out+16(FP), DX
-       MOVQ outlen+24(FP), CX
-
-       WORD $0xf685             // test    esi, esi
-       JLE  LBB0_5
-       WORD $0x8941; BYTE $0xf0 // mov    r8d, esi
-       LONG $0x03e0c149         // shl    r8, 3
-       WORD $0x3145; BYTE $0xd2 // xor    r10d, r10d
-       JMP  LBB0_2
-
-LBB0_4:
-       LONG $0x08c28349         // add    r10, 8
-       LONG $0x01c78348         // add    rdi, 1
-       WORD $0x394d; BYTE $0xd0 // cmp    r8, r10
-       JE   LBB0_5
-
-LBB0_2:
-       WORD $0x3941; BYTE $0xca // cmp    r10d, ecx
-       JGE  LBB0_4
-       WORD $0x8945; BYTE $0xd1 // mov    r9d, r10d
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0x0124             // and    al, 1
-       LONG $0x0a048842         // mov    byte [rdx + r9], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x01ce8348         // or    rsi, 1
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8d0             // shr    al, 1
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x02ce8348         // or    rsi, 2
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x02 // shr    al, 2
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x03ce8348         // or    rsi, 3
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x03 // shr    al, 3
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x04ce8348         // or    rsi, 4
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x04 // shr    al, 4
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x05ce8348         // or    rsi, 5
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x05 // shr    al, 5
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       WORD $0x894c; BYTE $0xce // mov    rsi, r9
-       LONG $0x06ce8348         // or    rsi, 6
-       WORD $0xce39             // cmp    esi, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x06 // shr    al, 6
-       WORD $0x0124             // and    al, 1
-       WORD $0x0488; BYTE $0x32 // mov    byte [rdx + rsi], al
-       LONG $0x07c98349         // or    r9, 7
-       WORD $0x3941; BYTE $0xc9 // cmp    r9d, ecx
-       JGE  LBB0_4
-       WORD $0xb60f; BYTE $0x07 // movzx    eax, byte [rdi]
-       WORD $0xe8c0; BYTE $0x07 // shr    al, 7
-       LONG $0x0a048842         // mov    byte [rdx + r9], al
-       JMP  LBB0_4
-
-LBB0_5:
+       MOVQ outlen+24(FP), R13
+
+       TESTL SI, SI
+       JLE  done
+
+       MOVOU shuffle_sse4<>(SB), X3
+       MOVOU bitmask_sse4<>(SB), X4
+       MOVOU ones_sse4<>(SB), X5
+
+       XORQ R8, R8
+       XORQ R9, R9
+
+loop16:
+       MOVQ SI, AX
+       SUBQ R8, AX
+       CMPQ AX, $2
+       JL   loop8
+
+       MOVQ R13, AX
+       SUBQ R9, AX
+       CMPQ AX, $16
+       JL   loop8
+
+       MOVWLZX (DI)(R8*1), AX
+       MOVD AX, X0
+       PSHUFB X3, X0
+       MOVOU X0, X1
+       PAND  X4, X1
+       PCMPEQB X4, X1
+       PAND X5, X1
+       MOVOU X1, (DX)(R9*1)
+
+       ADDQ $2, R8
+       ADDQ $16, R9
+       JMP  loop16
+
+loop8:
+       CMPQ R8, SI
+       JGE  done
+
+       MOVQ R13, AX
+       SUBQ R9, AX
+       CMPQ AX, $8
+       JL   scalar
+
+       MOVBLZX (DI)(R8*1), AX
+       MOVD AX, X0
+       PXOR X6, X6
+       PSHUFB X6, X0
+       PAND  X4, X0
+       PCMPEQB X4, X0
+       PAND  X5, X0
+       MOVQ X0, (DX)(R9*1)
+
+       ADDQ $1, R8
+       ADDQ $8, R9
+       JMP  loop8
+
+scalar:
+       CMPQ R8, SI
+       JGE  done
+       CMPQ R9, R13
+       JGE  done
+
+       MOVBLZX (DI)(R8*1), AX
+       XORQ CX, CX
+
+scalar_bit:
+       CMPQ CX, $8
+       JGE  scalar_next
+       CMPQ R9, R13
+       JGE  done
+
+       MOVL AX, R11
+       SHRL CL, R11
+       ANDL $1, R11
+       MOVB R11, (DX)(R9*1)
+
+       INCQ CX
+       INCQ R9
+       JMP  scalar_bit
+
+scalar_next:
+       INCQ R8
+       JMP  scalar
+
+done:
        RET

Reply via email to