This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 24e89846 perf(parquet/internal/encoding): vectorize amd64 bool unpack
(#735)
24e89846 is described below
commit 24e89846346b7f667db94ecba41caece4e49e26b
Author: Matt Topol <[email protected]>
AuthorDate: Sun Mar 29 13:54:21 2026 -0400
perf(parquet/internal/encoding): vectorize amd64 bool unpack (#735)
### Rationale for this change
The SSE4 and AVX2 implementations of _bytes_to_bools in
parquet/internal/utils/ contain zero SIMD instructions. The compiler
completely failed to auto-vectorize the C loop, producing purely scalar
code (movzx/shr/and/mov, one bit at a time). The SSE4 and AVX2 .s files
are byte-for-byte identical — just scalar code with different labels.
This is the amd64 counterpart to #731 which fixed the same issue on
ARM64 NEON.
### What changes are included in this PR?
Rewrote both assembly implementations with actual SIMD vectorized code.
SSE4 (unpack_bool_sse4_amd64.s) — processes 2 input bytes → 16 output
bools per iteration:
1. MOVWLZX + MOVD — load 2 input bytes into XMM
2. PSHUFB — broadcast byte 0 → lanes 0-7, byte 1 → lanes 8-15
3. PAND + PCMPEQB — parallel bit-test against mask
[1,2,4,8,16,32,64,128] × 2
4. PAND — normalize 0xFF → 0x01 for valid Go bool values
5. MOVOU — store 16 output bools at once
AVX2 (unpack_bool_avx2_amd64.s) — processes 4 input bytes → 32 output
bools per iteration:
1. MOVL + MOVD + VPBROADCASTD — load and broadcast 4 bytes across all 32
YMM lanes
2. VPSHUFB — distribute each byte to its 8 corresponding lanes
3. VPAND + VPCMPEQB + VPAND — parallel bit-test + normalize to 0/1
4. VMOVDQU — store 32 output bools at once
5. VZEROUPPER — avoid SSE-AVX transition penalties on return
Both include scalar tails for when fewer than vector-width output slots
remain.
### Are these changes tested?
All existing tests continue to pass, and new tests were added to further
validate:
- TestBytesToBoolsCorrectness — validates every bit position against the
reference Go implementation for sizes 1–1024 bytes
- TestBytesToBoolsOutlenSmaller — edge case where output is smaller than
8× input
- BenchmarkBytesToBools — parametric benchmark at 64B, 256B, 1KB, 4KB,
16KB
### Are there any user-facing changes?
No, this is purely a performance optimization:
*Benchmark Results (AMD Ryzen 7 7800X3D, linux/amd64)*
```
baseline (scalar) optimized (AVX2)
sec/op sec/op vs base
BytesToBools/bytes=64-16 146.0n 15.60n -89.32% (p=0.008)
BytesToBools/bytes=256-16 562.3n 63.36n -88.73% (p=0.008)
BytesToBools/bytes=1K-16 2.247µ 253.9n -88.70% (p=0.008)
BytesToBools/bytes=4K-16 8.970µ 1.018µ -88.65% (p=0.008)
BytesToBools/bytes=16K-16 35.798µ 4.044µ -88.70% (p=0.008)
geomean 2.262µ 252.8n -88.82%
```
Throughput: 432 MiB/s → 3,853 MiB/s (+795%)
Zero allocations in both versions. All results statistically
significant.
---
parquet/internal/utils/unpack_bool.go | 6 +-
parquet/internal/utils/unpack_bool_avx2_amd64.s | 201 ++++++++++++---------
.../internal/utils/unpack_bool_benchmark_test.go | 105 +++++++++++
parquet/internal/utils/unpack_bool_sse4_amd64.s | 190 ++++++++++---------
4 files changed, 339 insertions(+), 163 deletions(-)
diff --git a/parquet/internal/utils/unpack_bool.go
b/parquet/internal/utils/unpack_bool.go
index 3ccb0b7b..07e61a4e 100644
--- a/parquet/internal/utils/unpack_bool.go
+++ b/parquet/internal/utils/unpack_bool.go
@@ -20,7 +20,11 @@ package utils
func bytesToBoolsGo(in []byte, out []bool) {
for i, b := range in {
for j := 0; j < 8; j++ {
- out[8*i+j] = (b & (1 << j)) != 0
+ idx := 8*i + j
+ if idx >= len(out) {
+ return
+ }
+ out[idx] = (b & (1 << j)) != 0
}
}
}
diff --git a/parquet/internal/utils/unpack_bool_avx2_amd64.s
b/parquet/internal/utils/unpack_bool_avx2_amd64.s
index 459ff786..6079fd11 100644
--- a/parquet/internal/utils/unpack_bool_avx2_amd64.s
+++ b/parquet/internal/utils/unpack_bool_avx2_amd64.s
@@ -1,88 +1,127 @@
//+build !noasm !appengine
-// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
-TEXT ·_bytes_to_bools_avx2(SB), $0-32
+// AVX2 vectorized bytes-to-bools using VPBROADCASTD + VPSHUFB + VPCMPEQB.
+// Processes 4 input bytes → 32 output bools per vector iteration.
+// Replaces the original c2goasm-generated scalar code which used zero SIMD.
+
+#include "textflag.h"
+
+// VPSHUFB operates on two independent 128-bit lanes.
+// Lower lane: byte 0 → lanes 0-7, byte 1 → lanes 8-15
+// Upper lane: byte 2 → lanes 16-23, byte 3 → lanes 24-31
+DATA shuffle_avx2<>+0x00(SB)/8, $0x0000000000000000
+DATA shuffle_avx2<>+0x08(SB)/8, $0x0101010101010101
+DATA shuffle_avx2<>+0x10(SB)/8, $0x0202020202020202
+DATA shuffle_avx2<>+0x18(SB)/8, $0x0303030303030303
+GLOBL shuffle_avx2<>(SB), (NOPTR+RODATA), $32
+
+// [1, 2, 4, 8, 16, 32, 64, 128] × 4
+DATA bitmask_avx2<>+0x00(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x08(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x10(SB)/8, $0x8040201008040201
+DATA bitmask_avx2<>+0x18(SB)/8, $0x8040201008040201
+GLOBL bitmask_avx2<>(SB), (NOPTR+RODATA), $32
+
+// [1, 1, 1, ...] × 32
+DATA ones_avx2<>+0x00(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x08(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x10(SB)/8, $0x0101010101010101
+DATA ones_avx2<>+0x18(SB)/8, $0x0101010101010101
+GLOBL ones_avx2<>(SB), (NOPTR+RODATA), $32
+
+TEXT ·_bytes_to_bools_avx2(SB), NOSPLIT, $0-32
MOVQ in+0(FP), DI
MOVQ len+8(FP), SI
MOVQ out+16(FP), DX
- MOVQ outlen+24(FP), CX
-
- WORD $0xf685 // test esi, esi
- JLE LBB0_5
- WORD $0x8941; BYTE $0xf0 // mov r8d, esi
- LONG $0x03e0c149 // shl r8, 3
- WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
- JMP LBB0_2
-
-LBB0_4:
- LONG $0x08c28349 // add r10, 8
- LONG $0x01c78348 // add rdi, 1
- WORD $0x394d; BYTE $0xd0 // cmp r8, r10
- JE LBB0_5
-
-LBB0_2:
- WORD $0x3941; BYTE $0xca // cmp r10d, ecx
- JGE LBB0_4
- WORD $0x8945; BYTE $0xd1 // mov r9d, r10d
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0x0124 // and al, 1
- LONG $0x0a048842 // mov byte [rdx + r9], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x01ce8348 // or rsi, 1
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8d0 // shr al, 1
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x02ce8348 // or rsi, 2
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x02 // shr al, 2
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x03ce8348 // or rsi, 3
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x03 // shr al, 3
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x04ce8348 // or rsi, 4
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x04 // shr al, 4
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x05ce8348 // or rsi, 5
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x05 // shr al, 5
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x06ce8348 // or rsi, 6
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x06 // shr al, 6
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- LONG $0x07c98349 // or r9, 7
- WORD $0x3941; BYTE $0xc9 // cmp r9d, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x07 // shr al, 7
- LONG $0x0a048842 // mov byte [rdx + r9], al
- JMP LBB0_4
-
-LBB0_5:
+ MOVQ outlen+24(FP), R13
+
+ TESTL SI, SI
+ JLE done
+
+ VMOVDQU shuffle_avx2<>(SB), Y3
+ VMOVDQU bitmask_avx2<>(SB), Y4
+ VMOVDQU ones_avx2<>(SB), Y5
+
+ XORQ R8, R8
+ XORQ R9, R9
+
+loop32:
+ MOVQ SI, AX
+ SUBQ R8, AX
+ CMPQ AX, $4
+ JL loop8
+
+ MOVQ R13, AX
+ SUBQ R9, AX
+ CMPQ AX, $32
+ JL loop8
+
+ MOVL (DI)(R8*1), AX
+ MOVD AX, X0
+ VPBROADCASTD X0, Y0
+ VPSHUFB Y3, Y0, Y0
+ VPAND Y4, Y0, Y1
+ VPCMPEQB Y4, Y1, Y1
+ VPAND Y5, Y1, Y1
+ VMOVDQU Y1, (DX)(R9*1)
+
+ ADDQ $4, R8
+ ADDQ $32, R9
+ JMP loop32
+
+loop8:
+ CMPQ R8, SI
+ JGE avx_done
+
+ MOVQ R13, AX
+ SUBQ R9, AX
+ CMPQ AX, $8
+ JL scalar
+
+ MOVBLZX (DI)(R8*1), AX
+ MOVD AX, X0
+ VPBROADCASTD X0, Y0
+ VPSHUFB Y3, Y0, Y0
+ VPAND Y4, Y0, Y1
+ VPCMPEQB Y4, Y1, Y1
+ VPAND Y5, Y1, Y1
+ MOVQ X1, (DX)(R9*1)
+
+ ADDQ $1, R8
+ ADDQ $8, R9
+ JMP loop8
+
+scalar:
+ CMPQ R8, SI
+ JGE avx_done
+ CMPQ R9, R13
+ JGE avx_done
+
+ MOVBLZX (DI)(R8*1), AX
+ XORQ CX, CX
+
+scalar_bit:
+ CMPQ CX, $8
+ JGE scalar_next
+ CMPQ R9, R13
+ JGE avx_done
+
+ MOVL AX, R11
+ SHRL CL, R11
+ ANDL $1, R11
+ MOVB R11, (DX)(R9*1)
+
+ INCQ CX
+ INCQ R9
+ JMP scalar_bit
+
+scalar_next:
+ INCQ R8
+ JMP scalar
+
+avx_done:
+ VZEROUPPER
+
+done:
RET
diff --git a/parquet/internal/utils/unpack_bool_benchmark_test.go
b/parquet/internal/utils/unpack_bool_benchmark_test.go
new file mode 100644
index 00000000..e5d18b94
--- /dev/null
+++ b/parquet/internal/utils/unpack_bool_benchmark_test.go
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils_test
+
+import (
+ "fmt"
+ "math/rand"
+ "testing"
+
+ "github.com/apache/arrow-go/v18/parquet/internal/utils"
+)
+
+func BenchmarkBytesToBools(b *testing.B) {
+ for _, nBytes := range []int{64, 256, 1024, 4096, 16384} {
+ in := make([]byte, nBytes)
+ rng := rand.New(rand.NewSource(42))
+ for i := range in {
+ in[i] = byte(rng.Intn(256))
+ }
+ out := make([]bool, nBytes*8)
+
+ b.Run(fmt.Sprintf("bytes=%d", nBytes), func(b *testing.B) {
+ b.SetBytes(int64(nBytes))
+ for i := 0; i < b.N; i++ {
+ utils.BytesToBools(in, out)
+ }
+ })
+ }
+}
+
+func TestBytesToBoolsCorrectness(t *testing.T) {
+ rng := rand.New(rand.NewSource(12345))
+
+ for _, nBytes := range []int{1, 2, 3, 7, 8, 15, 16, 31, 32, 63, 64, 100, 256, 1024} {
+ t.Run(fmt.Sprintf("bytes=%d", nBytes), func(t *testing.T) {
+ in := make([]byte, nBytes)
+ for i := range in {
+ in[i] = byte(rng.Intn(256))
+ }
+
+ outlen := nBytes * 8
+ got := make([]bool, outlen)
+ want := make([]bool, outlen)
+
+ for i, b := range in {
+ for j := 0; j < 8; j++ {
+ want[8*i+j] = (b & (1 << j)) != 0
+ }
+ }
+
+ utils.BytesToBools(in, got)
+
+ for i := 0; i < outlen; i++ {
+ if got[i] != want[i] {
+ byteIdx := i / 8
+ bitIdx := i % 8
+ t.Fatalf("mismatch at index %d (byte %d, bit %d): got %v, want %v (input byte = 0x%02x)",
+ i, byteIdx, bitIdx, got[i], want[i], in[byteIdx])
+ }
+ }
+ })
+ }
+}
+
+func TestBytesToBoolsOutlenSmaller(t *testing.T) {
+ in := []byte{0xFF, 0xAA, 0x55}
+ for outlen := 1; outlen <= 24; outlen++ {
+ t.Run(fmt.Sprintf("outlen=%d", outlen), func(t *testing.T) {
+ got := make([]bool, outlen)
+ want := make([]bool, outlen)
+
+ for i, b := range in {
+ for j := 0; j < 8; j++ {
+ idx := 8*i + j
+ if idx >= outlen {
+ break
+ }
+ want[idx] = (b & (1 << j)) != 0
+ }
+ }
+
+ utils.BytesToBools(in, got)
+
+ for i := 0; i < outlen; i++ {
+ if got[i] != want[i] {
+ t.Fatalf("outlen=%d: mismatch at index %d: got %v, want %v", outlen, i, got[i], want[i])
+ }
+ }
+ })
+ }
+}
diff --git a/parquet/internal/utils/unpack_bool_sse4_amd64.s
b/parquet/internal/utils/unpack_bool_sse4_amd64.s
index ac8acb9f..a8d9fc88 100644
--- a/parquet/internal/utils/unpack_bool_sse4_amd64.s
+++ b/parquet/internal/utils/unpack_bool_sse4_amd64.s
@@ -1,88 +1,116 @@
//+build !noasm !appengine
-// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
-TEXT ·_bytes_to_bools_sse4(SB), $0-32
+// SSE4 vectorized bytes-to-bools using PSHUFB broadcast + PCMPEQB bit-test.
+// Processes 2 input bytes → 16 output bools per vector iteration.
+// Replaces the original c2goasm-generated scalar code which used zero SIMD.
+
+#include "textflag.h"
+
+// broadcast byte 0 → lanes 0-7, byte 1 → lanes 8-15
+DATA shuffle_sse4<>+0x00(SB)/8, $0x0000000000000000
+DATA shuffle_sse4<>+0x08(SB)/8, $0x0101010101010101
+GLOBL shuffle_sse4<>(SB), (NOPTR+RODATA), $16
+
+// [1, 2, 4, 8, 16, 32, 64, 128] × 2
+DATA bitmask_sse4<>+0x00(SB)/8, $0x8040201008040201
+DATA bitmask_sse4<>+0x08(SB)/8, $0x8040201008040201
+GLOBL bitmask_sse4<>(SB), (NOPTR+RODATA), $16
+
+// [1, 1, 1, ...] × 16
+DATA ones_sse4<>+0x00(SB)/8, $0x0101010101010101
+DATA ones_sse4<>+0x08(SB)/8, $0x0101010101010101
+GLOBL ones_sse4<>(SB), (NOPTR+RODATA), $16
+
+TEXT ·_bytes_to_bools_sse4(SB), NOSPLIT, $0-32
MOVQ in+0(FP), DI
MOVQ len+8(FP), SI
MOVQ out+16(FP), DX
- MOVQ outlen+24(FP), CX
-
- WORD $0xf685 // test esi, esi
- JLE LBB0_5
- WORD $0x8941; BYTE $0xf0 // mov r8d, esi
- LONG $0x03e0c149 // shl r8, 3
- WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
- JMP LBB0_2
-
-LBB0_4:
- LONG $0x08c28349 // add r10, 8
- LONG $0x01c78348 // add rdi, 1
- WORD $0x394d; BYTE $0xd0 // cmp r8, r10
- JE LBB0_5
-
-LBB0_2:
- WORD $0x3941; BYTE $0xca // cmp r10d, ecx
- JGE LBB0_4
- WORD $0x8945; BYTE $0xd1 // mov r9d, r10d
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0x0124 // and al, 1
- LONG $0x0a048842 // mov byte [rdx + r9], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x01ce8348 // or rsi, 1
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8d0 // shr al, 1
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x02ce8348 // or rsi, 2
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x02 // shr al, 2
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x03ce8348 // or rsi, 3
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x03 // shr al, 3
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x04ce8348 // or rsi, 4
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x04 // shr al, 4
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x05ce8348 // or rsi, 5
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x05 // shr al, 5
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- WORD $0x894c; BYTE $0xce // mov rsi, r9
- LONG $0x06ce8348 // or rsi, 6
- WORD $0xce39 // cmp esi, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x06 // shr al, 6
- WORD $0x0124 // and al, 1
- WORD $0x0488; BYTE $0x32 // mov byte [rdx + rsi], al
- LONG $0x07c98349 // or r9, 7
- WORD $0x3941; BYTE $0xc9 // cmp r9d, ecx
- JGE LBB0_4
- WORD $0xb60f; BYTE $0x07 // movzx eax, byte [rdi]
- WORD $0xe8c0; BYTE $0x07 // shr al, 7
- LONG $0x0a048842 // mov byte [rdx + r9], al
- JMP LBB0_4
-
-LBB0_5:
+ MOVQ outlen+24(FP), R13
+
+ TESTL SI, SI
+ JLE done
+
+ MOVOU shuffle_sse4<>(SB), X3
+ MOVOU bitmask_sse4<>(SB), X4
+ MOVOU ones_sse4<>(SB), X5
+
+ XORQ R8, R8
+ XORQ R9, R9
+
+loop16:
+ MOVQ SI, AX
+ SUBQ R8, AX
+ CMPQ AX, $2
+ JL loop8
+
+ MOVQ R13, AX
+ SUBQ R9, AX
+ CMPQ AX, $16
+ JL loop8
+
+ MOVWLZX (DI)(R8*1), AX
+ MOVD AX, X0
+ PSHUFB X3, X0
+ MOVOU X0, X1
+ PAND X4, X1
+ PCMPEQB X4, X1
+ PAND X5, X1
+ MOVOU X1, (DX)(R9*1)
+
+ ADDQ $2, R8
+ ADDQ $16, R9
+ JMP loop16
+
+loop8:
+ CMPQ R8, SI
+ JGE done
+
+ MOVQ R13, AX
+ SUBQ R9, AX
+ CMPQ AX, $8
+ JL scalar
+
+ MOVBLZX (DI)(R8*1), AX
+ MOVD AX, X0
+ PXOR X6, X6
+ PSHUFB X6, X0
+ PAND X4, X0
+ PCMPEQB X4, X0
+ PAND X5, X0
+ MOVQ X0, (DX)(R9*1)
+
+ ADDQ $1, R8
+ ADDQ $8, R9
+ JMP loop8
+
+scalar:
+ CMPQ R8, SI
+ JGE done
+ CMPQ R9, R13
+ JGE done
+
+ MOVBLZX (DI)(R8*1), AX
+ XORQ CX, CX
+
+scalar_bit:
+ CMPQ CX, $8
+ JGE scalar_next
+ CMPQ R9, R13
+ JGE done
+
+ MOVL AX, R11
+ SHRL CL, R11
+ ANDL $1, R11
+ MOVB R11, (DX)(R9*1)
+
+ INCQ CX
+ INCQ R9
+ JMP scalar_bit
+
+scalar_next:
+ INCQ R8
+ JMP scalar
+
+done:
RET