This is an automated email from the git hooks/post-receive script.

A commit has been pushed to branch master
in repository ffmpeg.

commit 1b6571c765def1186ce1d78cc38628eacbdd1492
Author:     Shreesh Adiga <[email protected]>
AuthorDate: Sun Oct 26 16:07:17 2025 +0530
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Jan 4 15:49:30 2026 +0100

    avutil/crc: add x86 SSE4.2 clmul SIMD implementation for av_crc
    
    Implemented the algorithm described in the paper titled
    "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
    by Intel.
    It is not used yet; the integration will be added in a separate commit.
    
    Observed a near-10x speedup on an AMD Zen 4 7950X:
    av_crc_c:                                            22057.0 ( 1.00x)
    av_crc_clmul:                                         2202.8 (10.01x)
---
 libavutil/x86/crc.asm | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/crc.h   | 167 ++++++++++++++++++++++++++++
 2 files changed, 464 insertions(+)
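
For readers unfamiliar with the technique from the Intel paper, the core idea
is to keep a 128-bit running remainder and repeatedly carry-less multiply its
two 64-bit halves by pre-computed constants (powers of x modulo the CRC
polynomial) while XORing in fresh input. A minimal C-intrinsics sketch of one
such fold step follows; it is illustrative only and not part of the patch: the
function name fold_16_bytes is made up, it covers the non-reflected bit order,
and it assumes x^128 mod P(x) sits in the low quadword of k and
x^(128+64) mod P(x) in the high quadword. The patch's FOLD_SINGLE macro
performs the same step with a different constant layout.

    /* Illustrative sketch only -- build with -mpclmul (GCC/Clang). */
    #include <stdint.h>
    #include <emmintrin.h>   /* _mm_loadu_si128, _mm_xor_si128 */
    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 */

    __m128i fold_16_bytes(__m128i acc, __m128i k, const uint8_t *buf)
    {
        /* acc[63:0] * k[63:0] and acc[127:64] * k[127:64] are carry-less
         * products of degree < 96; XORed together with the next 16 input
         * bytes they give the remainder folded forward by 16 bytes. */
        __m128i lo = _mm_clmulepi64_si128(acc, k, 0x00);
        __m128i hi = _mm_clmulepi64_si128(acc, k, 0x11);
        __m128i in = _mm_loadu_si128((const __m128i *)buf);
        return _mm_xor_si128(_mm_xor_si128(lo, hi), in);
    }
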

diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
new file mode 100644
index 0000000000..95cf90d250
--- /dev/null
+++ b/libavutil/x86/crc.asm
@@ -0,0 +1,297 @@
+;*****************************************************************************
+;* Copyright (c) 2025 Shreesh Adiga <[email protected]>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION_RODATA
+reverse_shuffle: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
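+; pshufb control bytes: entries with the high bit set select zero.  Loading
+; 16 bytes at offset (16 - n) yields a mask that shifts an XMM register up
+; by n bytes while zero-filling, used by XMM_SHIFT_LEFT and the
+; partial-block handling.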
+partial_bytes_shuf_tab: db 255, 254, 253, 252, 251, 250, 249, 248,\
+                           247, 246, 245, 244, 243, 242, 241, 240,\
+                             0,   1,   2,   3,   4,   5,   6,   7,\
+                             8,   9,  10,  11,  12,  13,  14,  15
+
+SECTION .text
+
+%macro FOLD_128_TO_64 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
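+; Reduce the 128-bit folded remainder in %2 to 64 bits using the
+; pre-computed constants in %3 (separate sequences for the reflected/LE and
+; normal bit orders).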
+%if %1 == 1
+    mova      %4, %2
+    pclmulqdq %2, %3, 0x00
+    psrldq    %4, 8
+    pxor      %2, %4
+    mova      %4, %2
+    psllq     %4, 32
+    pclmulqdq %4, %3, 0x10
+    pxor      %2, %4
+%else
+    movq      %4, %2
+    pclmulqdq %2, %3, 0x11
+    pslldq    %4, 4
+    pxor      %4, %2
+    mova      %2, %4
+    pclmulqdq %4, %3, 0x01
+    pxor      %2, %4
+%endif
+%endmacro
+
+%macro FOLD_64_TO_32 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
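+; Final reduction of the 64-bit remainder in %2 to the 32-bit CRC, which is
+; returned in eax.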
+%if %1 == 1
+    pxor      %4, %4
+    pblendw   %4, %2, 0xfc
+    mova      %2, %4
+    pclmulqdq %4, %3, 0x00
+    pxor      %4, %2
+    pclmulqdq %4, %3, 0x10
+    pxor      %2, %4
+    pextrd   eax, %2, 2
+%else
+    mova      %4, %2
+    pclmulqdq %2, %3, 0x00
+    pclmulqdq %2, %3, 0x11
+    pxor      %2, %4
+    movd     eax, %2
+    bswap    eax
+%endif
+%endmacro
+
+%macro FOLD_SINGLE 4
+; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block
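+; One folding step: multiply the two 64-bit halves of the running remainder
+; in %2 by the fold constants in %3 and XOR in the next 16-byte input block
+; from %4.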
+    mova      %1, %2
+    pclmulqdq %1, %3, 0x01
+    pxor      %1, %4
+    pclmulqdq %2, %3, 0x10
+    pxor      %2, %1
+%endmacro
+
+%macro XMM_SHIFT_LEFT 4
+; %1 xmm input reg ; %2 shift bytes amount ; %3 temp xmm register ; %4 temp gpr
+    lea    %4, [partial_bytes_shuf_tab]
+    movu   %3, [%4 + 16 - (%2)]
+    pshufb %1, %3
+%endmacro
+
+%macro MEMCPY_0_15 6
+; %1 dst ; %2 src ; %3 len ; %4, %5 temp gpr register; %6 done label
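+; Copy %3 (0..15) bytes from %2 to %1 with overlapping loads/stores
+; selected by size class (8-15, 4-7, 2-3, 1 or 0 bytes).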
+    cmp %3, 8
+    jae .between_8_15
+    cmp %3, 4
+    jae .between_4_7
+    cmp %3, 1
+    ja .between_2_3
+    jb %6
+    mov  %4b, [%2]
+    mov [%1], %4b
+    jmp %6
+
+.between_8_15:
+%if ARCH_X86_64
+    mov           %4q, [%2]
+    mov           %5q, [%2 + %3 - 8]
+    mov          [%1], %4q
+    mov [%1 + %3 - 8], %5q
+    jmp %6
+%else
+    xor            %5, %5
+.copy4b:
+        mov       %4d, [%2 + %5]
+        mov [%1 + %5], %4d
+        add        %5, 4
+        lea        %4, [%5 + 4]
+        cmp        %4, %3
+        jb        .copy4b
+
+    mov           %4d, [%2 + %3 - 4]
+    mov [%1 + %3 - 4], %4d
+    jmp %6
+%endif
+.between_4_7:
+    mov           %4d, [%2]
+    mov           %5d, [%2 + %3 - 4]
+    mov          [%1], %4d
+    mov [%1 + %3 - 4], %5d
+    jmp %6
+.between_2_3:
+    mov           %4w, [%2]
+    mov           %5w, [%2 + %3 - 2]
+    mov          [%1], %4w
+    mov [%1 + %3 - 2], %5w
+    ; fall through, %6 label is expected to be next instruction
+%endmacro
+
+%macro CRC 1
+;-----------------------------------------------------------------------------------------------
+; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
+;-----------------------------------------------------------------------------------------------
+; %1 == 1 - LE format
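+;
+; Overall flow: on x86_64, fold four 16-byte blocks per iteration while at
+; least another 64 bytes of input remain, then one block at a time; a
+; partial final block is handled with a pshufb mask, inputs shorter than 16
+; bytes are copied to the stack first, and the remainder is reduced
+; 128 -> 64 -> 32 bits.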
+%if %1 == 1
+cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10
+%else
+cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
+%endif
+
+%if ARCH_X86_32
+    %define m10 m6
+%endif
+
+%if %1 == 0
+    mova  m10, [reverse_shuffle]
+%endif
+
+    movd   m4, r1d
+%if ARCH_X86_32
+    ; skip 4x unrolled loop due to only 8 XMM reg being available in X86_32
+    jmp   .less_than_64bytes
+%else
+    cmp    r3, 64
+    jb    .less_than_64bytes
+    movu   m1, [r2 +  0]
+    movu   m3, [r2 + 16]
+    movu   m2, [r2 + 32]
+    movu   m0, [r2 + 48]
+    pxor   m1, m4
+%if %1 == 0
+    pshufb m0, m10
+    pshufb m1, m10
+    pshufb m2, m10
+    pshufb m3, m10
+%endif
+    mov    r4, 64
+    cmp    r3, 128
+    jb    .reduce_4x_to_1
+    movu   m4, [r0]
+
+.fold_4x_loop:
+        movu        m6, [r2 + r4 +  0]
+        movu        m7, [r2 + r4 + 16]
+        movu        m8, [r2 + r4 + 32]
+        movu        m9, [r2 + r4 + 48]
+%if %1 == 0
+        pshufb      m6, m10
+        pshufb      m7, m10
+        pshufb      m8, m10
+        pshufb      m9, m10
+%endif
+        FOLD_SINGLE m5, m1, m4, m6
+        FOLD_SINGLE m5, m3, m4, m7
+        FOLD_SINGLE m5, m2, m4, m8
+        FOLD_SINGLE m5, m0, m4, m9
+        add         r4, 64
+        lea         r5, [r4 + 64]
+        cmp         r5, r3
+        jbe        .fold_4x_loop
+
+.reduce_4x_to_1:
+    movu        m4, [r0 + 16]
+    FOLD_SINGLE m5, m1, m4, m3
+    FOLD_SINGLE m5, m1, m4, m2
+    FOLD_SINGLE m5, m1, m4, m0
+%endif
+
+.fold_1x_pre:
+    lea  r5, [r4 + 16]
+    cmp  r5, r3
+    ja  .partial_block
+
+.fold_1x_loop:
+        movu        m2, [r2 + r4]
+%if %1 == 0
+        pshufb      m2, m10
+%endif
+        FOLD_SINGLE m5, m1, m4, m2
+        add         r4, 16
+        lea         r5, [r4 + 16]
+        cmp         r5, r3
+        jbe        .fold_1x_loop
+
+.partial_block:
+    cmp         r4, r3
+    jae        .reduce_128_to_64
+    movu        m2, [r2 + r3 - 16]
+    and         r3, 0xf
+    lea         r4, [partial_bytes_shuf_tab]
+    movu        m0, [r3 + r4]
+%if %1 == 0
+    pshufb      m1, m10
+%endif
+    mova        m3, m1
+    pcmpeqd     m5, m5 ; m5 = _mm_set1_epi8(0xff)
+    pxor        m5, m0
+    pshufb      m3, m5
+    pblendvb    m2, m3, m0
+    pshufb      m1, m0
+%if %1 == 0
+    pshufb      m1, m10
+    pshufb      m2, m10
+%endif
+    FOLD_SINGLE m5, m1, m4, m2
+
+.reduce_128_to_64:
+    movu           m4, [r0 + 32]
+    FOLD_128_TO_64 %1, m1, m4, m5
+.reduce_64_to_32:
+    movu           m4, [r0 + 48]
+    FOLD_64_TO_32  %1, m1, m4, m5
+    RET
+
+.less_than_64bytes:
+    cmp    r3, 16
+    jb    .less_than_16bytes
+    movu   m1, [r2]
+    pxor   m1, m4
+%if %1 == 0
+    pshufb m1, m10
+%endif
+    mov    r4, 16
+    movu   m4, [r0 + 16]
+    jmp   .fold_1x_pre
+
+.less_than_16bytes:
+    pxor           m1, m1
+    movu        [rsp], m1
+    MEMCPY_0_15   rsp, r2, r3, r1, r4, .memcpy_done
+
+.memcpy_done:
+    movu           m1, [rsp]
+    pxor           m1, m4
+    cmp            r3, 5
+    jb            .less_than_5bytes
+    XMM_SHIFT_LEFT m1, (16 - r3), m2, r4
+%if %1 == 0
+    pshufb         m1, m10
+%endif
+    jmp           .reduce_128_to_64
+
+.less_than_5bytes:
+%if %1 == 0
+    XMM_SHIFT_LEFT m1, (4 - r3), m2, r4
+    movq          m10, [reverse_shuffle + 8] ; 0x0001020304050607
+    pshufb         m1, m10
+%else
+    XMM_SHIFT_LEFT m1, (8 - r3), m2, r4
+%endif
+    jmp .reduce_64_to_32
+
+%endmacro
+
+INIT_XMM clmul
+CRC 0
+CRC 1
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
new file mode 100644
index 0000000000..5fabfa7570
--- /dev/null
+++ b/libavutil/x86/crc.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2025 Shreesh Adiga <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_CRC_H
+#define AVUTIL_X86_CRC_H
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/attributes_internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/crc.h"
+#include "libavutil/reverse.h"
+#include "libavutil/x86/cpu.h"
+
+#if HAVE_CLMUL_EXTERNAL
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_clmul(const AVCRC *ctx, uint32_t crc,
+                      const uint8_t *buffer, size_t length);
+uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
+                         const uint8_t *buffer, size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
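+/*
+ * Pre-computed constants for the standard AVCRC polynomials, stored as the
+ * low/high 32-bit halves of the eight 64-bit context words consumed by the
+ * assembly.
+ */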
+static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
+    [AV_CRC_8_ATM] = {
+        0x32000000, 0x0, 0xbc000000, 0x0,
+        0xc4000000, 0x0, 0x94000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+    },
+    [AV_CRC_8_EBU] = {
+        0xb5000000, 0x0, 0xf3000000, 0x0,
+        0xfc000000, 0x0, 0x0d000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+    },
+    [AV_CRC_16_ANSI] = {
+        0xf9e30000, 0x0, 0x807d0000, 0x0,
+        0xf9130000, 0x0, 0xff830000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+    },
+    [AV_CRC_16_CCITT] = {
+        0x60190000, 0x0, 0x59b00000, 0x0,
+        0xd5f60000, 0x0, 0x45630000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+    },
+    [AV_CRC_24_IEEE] = {
+        0x1f428700, 0x0, 0x467d2400, 0x0,
+        0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+    },
+    [AV_CRC_32_IEEE] = {
+        0x8833794c, 0x0, 0xe6228b11, 0x0,
+        0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        0xc6e41596, 0x1, 0x54442bd4, 0x1,
+        0xccaa009e, 0x0, 0x751997d0, 0x1,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        0x0000bffa, 0x0, 0x1b0c2, 0x0,
+        0x00018cc2, 0x0, 0x1d0c2, 0x0,
+        0x00018cc2, 0x0, 0x1bc02, 0x0,
+        0xcfffbffe, 0x1, 0x14003, 0x0,
+    },
+};
+
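+/* Bit-reversal helper used to convert polynomials between the reflected
+ * and normal representations. */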
+static uint64_t reverse(uint64_t p, unsigned int deg)
+{
+    uint64_t ret = 0;
+    int i;
+    for (i = 0; i < (deg / 8); i += 1) {
+        ret = (ret << 8) | (ff_reverse[p & 0xff]);
+        p >>= 8;
+    }
+    int rem = (deg + 1) - 8 * i;
+    ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem));
+    return ret;
+}
+
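+/*
+ * Compute x^n mod P(x) over GF(2), where P(x) is the degree-deg polynomial
+ * whose low coefficients are given by poly (leading term implicit); the
+ * quotient of the division is returned in *div.  With bitreverse set, both
+ * results are bit-reflected before being returned.
+ */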
+static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div, int bitreverse)
+{
+    uint64_t mod, mask, high;
+
+    if (n < deg) {
+        *div = 0;
+        return poly;
+    }
+    mask = ((uint64_t)1 << deg) - 1;
+    poly &= mask;
+    mod = poly;
+    *div = 1;
+    deg--;
+    while (--n > deg) {
+        high = (mod >> deg) & 1;
+        *div = (*div << 1) | high;
+        mod <<= 1;
+        if (high)
+            mod ^= poly;
+    }
+    uint64_t ret = mod & mask;
+    if (bitreverse) {
+        *div = reverse(*div, deg) << 1;
+        return reverse(ret, deg) << 1;
+    }
+    return ret;
+}
+
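+/*
+ * Fill the 64-byte context read by the assembly: x^k mod P(x) folding
+ * constants for the 4x fold, the 1x fold and the 128->64 reduction, plus
+ * the quotient and the polynomial used by the final 64->32 reduction (the
+ * LE variant stores everything in reflected form).
+ */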
+static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // convert the reversed representation to regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // convert to 32 degree polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t x1, x2, x3, x4, x5, x6, x7, x8, div;
+    uint8_t *dst = (uint8_t*)ctx;
+    if (le) {
+        AV_WN64(dst,      xnmodp(4 * 128 - 32, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
+        uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
+        AV_WN64(dst + 16, tmp);
+        AV_WN64(dst + 24, xnmodp(128 + 32, poly_, 32, &div, le));
+        AV_WN64(dst + 32, tmp);
+        AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
+    } else {
+        AV_WN64(dst,      xnmodp(4 * 128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst +  8, xnmodp(4 * 128, poly_, 32, &div, le));
+        AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 24, xnmodp(128, poly_, 32, &div, le));
+        AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
+        AV_WN64(dst + 56, poly_ | (1ULL << 32));
+    }
+}
+#endif
+
+#endif /* AVUTIL_X86_CRC_H */

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
