This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 1b6571c765def1186ce1d78cc38628eacbdd1492
Author:     Shreesh Adiga <[email protected]>
AuthorDate: Sun Oct 26 16:07:17 2025 +0530
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Jan 4 15:49:30 2026 +0100

    avutil/crc: add x86 SSE4.2 clmul SIMD implementation for av_crc

    Implemented the algorithm described in the Intel paper "Fast CRC
    Computation for Generic Polynomials Using PCLMULQDQ Instruction".
    It is not used yet; the integration will be added in a separate commit.

    Observed a nearly 10x speedup on an AMD Zen 4 7950X:
    av_crc_c:                              22057.0 ( 1.00x)
    av_crc_clmul:                           2202.8 (10.01x)
---
 libavutil/x86/crc.asm | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/crc.h   | 167 ++++++++++++++++++++++++++++
 2 files changed, 464 insertions(+)

diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
new file mode 100644
index 0000000000..95cf90d250
--- /dev/null
+++ b/libavutil/x86/crc.asm
@@ -0,0 +1,297 @@
+;*****************************************************************************
+;* Copyright (c) 2025 Shreesh Adiga <[email protected]>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION_RODATA
+reverse_shuffle:        db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+partial_bytes_shuf_tab: db 255, 254, 253, 252, 251, 250, 249, 248,\
+                           247, 246, 245, 244, 243, 242, 241, 240,\
+                             0,   1,   2,   3,   4,   5,   6,   7,\
+                             8,   9,  10,  11,  12,  13,  14,  15
+
+SECTION .text
+
+%macro FOLD_128_TO_64 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
+%if %1 == 1
+    mova %4, %2
+    pclmulqdq %2, %3, 0x00
+    psrldq %4, 8
+    pxor %2, %4
+    mova %4, %2
+    psllq %4, 32
+    pclmulqdq %4, %3, 0x10
+    pxor %2, %4
+%else
+    movq %4, %2
+    pclmulqdq %2, %3, 0x11
+    pslldq %4, 4
+    pxor %4, %2
+    mova %2, %4
+    pclmulqdq %4, %3, 0x01
+    pxor %2, %4
+%endif
+%endmacro
+
+%macro FOLD_64_TO_32 4
+; %1 LE ; %2 128 bit fold reg ; %3 pre-computed constant reg ; %4 tmp reg
+%if %1 == 1
+    pxor %4, %4
+    pblendw %4, %2, 0xfc
+    mova %2, %4
+    pclmulqdq %4, %3, 0x00
+    pxor %4, %2
+    pclmulqdq %4, %3, 0x10
+    pxor %2, %4
+    pextrd eax, %2, 2
+%else
+    mova %4, %2
+    pclmulqdq %2, %3, 0x00
+    pclmulqdq %2, %3, 0x11
+    pxor %2, %4
+    movd eax, %2
+    bswap eax
+%endif
+%endmacro
+
+%macro FOLD_SINGLE 4
+; %1 temp ; %2 fold reg ; %3 pre-computed constants ; %4 input data block
+    mova %1, %2
+    pclmulqdq %1, %3, 0x01
+    pxor %1, %4
+    pclmulqdq %2, %3, 0x10
+    pxor %2, %1
+%endmacro
+
+%macro XMM_SHIFT_LEFT 4
+; %1 xmm input reg ; %2 shift bytes amount ; %3 temp xmm register ; %4 temp gpr
+    lea %4, [partial_bytes_shuf_tab]
+    movu %3, [%4 + 16 - (%2)]
+    pshufb %1, %3
+%endmacro
+
+%macro MEMCPY_0_15 6
+; %1 dst ; %2 src ; %3 len ; %4, %5 temp gpr registers ; %6 done label
+    cmp %3, 8
+    jae .between_8_15
+    cmp %3, 4
+    jae .between_4_7
+    cmp %3, 1
+    ja .between_2_3
+    jb %6
+    mov %4b, [%2]
+    mov [%1], %4b
+    jmp %6
+
+.between_8_15:
+%if ARCH_X86_64
+    mov %4q, [%2]
+    mov %5q, [%2 + %3 - 8]
+    mov [%1], %4q
+    mov [%1 + %3 - 8], %5q
+    jmp %6
+%else
+    xor %5, %5
+.copy4b:
+    mov %4d, [%2 + %5]
+    mov [%1 + %5], %4d
+    add %5, 4
+    lea %4, [%5 + 4]
+    cmp %4, %3
+    jb .copy4b
+
+    mov %4d, [%2 + %3 - 4]
+    mov [%1 + %3 - 4], %4d
+    jmp %6
+%endif
+.between_4_7:
+    mov %4d, [%2]
+    mov %5d, [%2 + %3 - 4]
+    mov [%1], %4d
+    mov [%1 + %3 - 4], %5d
+    jmp %6
+.between_2_3:
+    mov %4w, [%2]
+    mov %5w, [%2 + %3 - 2]
+    mov [%1], %4w
+    mov [%1 + %3 - 2], %5w
+    ; fall through, %6 label is expected to be the next instruction
+%endmacro
+
+%macro CRC 1
+;-----------------------------------------------------------------------------------------------
+; ff_crc[_le]_clmul(const AVCRC *ctx, uint32_t crc, const uint8_t *buffer, size_t length)
+;-----------------------------------------------------------------------------------------------
+; %1 == 1 - LE format
+%if %1 == 1
+cglobal crc_le, 4, 6, 6+4*ARCH_X86_64, 0x10
+%else
+cglobal crc, 4, 6, 7+4*ARCH_X86_64, 0x10
+%endif
+
+%if ARCH_X86_32
+    %define m10 m6
+%endif
+
+%if %1 == 0
+    mova m10, [reverse_shuffle]
+%endif
+
+    movd m4, r1d
+%if ARCH_X86_32
+    ; skip the 4x unrolled loop since only 8 XMM registers are available on X86_32
+    jmp .less_than_64bytes
+%else
+    cmp r3, 64
+    jb .less_than_64bytes
+    movu m1, [r2 + 0]
+    movu m3, [r2 + 16]
+    movu m2, [r2 + 32]
+    movu m0, [r2 + 48]
+    pxor m1, m4
+%if %1 == 0
+    pshufb m0, m10
+    pshufb m1, m10
+    pshufb m2, m10
+    pshufb m3, m10
+%endif
+    mov r4, 64
+    cmp r3, 128
+    jb .reduce_4x_to_1
+    movu m4, [r0]
+
+.fold_4x_loop:
+    movu m6, [r2 + r4 + 0]
+    movu m7, [r2 + r4 + 16]
+    movu m8, [r2 + r4 + 32]
+    movu m9, [r2 + r4 + 48]
+%if %1 == 0
+    pshufb m6, m10
+    pshufb m7, m10
+    pshufb m8, m10
+    pshufb m9, m10
+%endif
+    FOLD_SINGLE m5, m1, m4, m6
+    FOLD_SINGLE m5, m3, m4, m7
+    FOLD_SINGLE m5, m2, m4, m8
+    FOLD_SINGLE m5, m0, m4, m9
+    add r4, 64
+    lea r5, [r4 + 64]
+    cmp r5, r3
+    jbe .fold_4x_loop
+
+.reduce_4x_to_1:
+    movu m4, [r0 + 16]
+    FOLD_SINGLE m5, m1, m4, m3
+    FOLD_SINGLE m5, m1, m4, m2
+    FOLD_SINGLE m5, m1, m4, m0
+%endif
+
+.fold_1x_pre:
+    lea r5, [r4 + 16]
+    cmp r5, r3
+    ja .partial_block
+
+.fold_1x_loop:
+    movu m2, [r2 + r4]
+%if %1 == 0
+    pshufb m2, m10
+%endif
+    FOLD_SINGLE m5, m1, m4, m2
+    add r4, 16
+    lea r5, [r4 + 16]
+    cmp r5, r3
+    jbe .fold_1x_loop
+
+.partial_block:
+    cmp r4, r3
+    jae .reduce_128_to_64
+    movu m2, [r2 + r3 - 16]
+    and r3, 0xf
+    lea r4, [partial_bytes_shuf_tab]
+    movu m0, [r3 + r4]
+%if %1 == 0
+    pshufb m1, m10
+%endif
+    mova m3, m1
+    pcmpeqd m5, m5 ; m5 = _mm_set1_epi8(0xff)
+    pxor m5, m0
+    pshufb m3, m5
+    pblendvb m2, m3, m0
+    pshufb m1, m0
+%if %1 == 0
+    pshufb m1, m10
+    pshufb m2, m10
+%endif
+    FOLD_SINGLE m5, m1, m4, m2
+
+.reduce_128_to_64:
+    movu m4, [r0 + 32]
+    FOLD_128_TO_64 %1, m1, m4, m5
+.reduce_64_to_32:
+    movu m4, [r0 + 48]
+    FOLD_64_TO_32 %1, m1, m4, m5
+    RET
+
+.less_than_64bytes:
+    cmp r3, 16
+    jb .less_than_16bytes
+    movu m1, [r2]
+    pxor m1, m4
+%if %1 == 0
+    pshufb m1, m10
+%endif
+    mov r4, 16
+    movu m4, [r0 + 16]
+    jmp .fold_1x_pre
+
+.less_than_16bytes:
+    pxor m1, m1
+    movu [rsp], m1
+    MEMCPY_0_15 rsp, r2, r3, r1, r4, .memcpy_done
+
+.memcpy_done:
+    movu m1, [rsp]
+    pxor m1, m4
+    cmp r3, 5
+    jb .less_than_5bytes
+    XMM_SHIFT_LEFT m1, (16 - r3), m2, r4
+%if %1 == 0
+    pshufb m1, m10
+%endif
+    jmp .reduce_128_to_64
+
+.less_than_5bytes:
+%if %1 == 0
+    XMM_SHIFT_LEFT m1, (4 - r3), m2, r4
+    movq m10, [reverse_shuffle + 8] ; 0x0001020304050607
+    pshufb m1, m10
+%else
+    XMM_SHIFT_LEFT m1, (8 - r3), m2, r4
+%endif
+    jmp .reduce_64_to_32
+
+%endmacro
+
+INIT_XMM clmul
+CRC 0
+CRC 1
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
new file mode 100644
index 0000000000..5fabfa7570
--- /dev/null
+++ b/libavutil/x86/crc.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2025 Shreesh Adiga <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_CRC_H
+#define AVUTIL_X86_CRC_H
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/attributes_internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/crc.h"
+#include "libavutil/reverse.h"
+#include "libavutil/x86/cpu.h"
+
+#if HAVE_CLMUL_EXTERNAL
+FF_VISIBILITY_PUSH_HIDDEN
+uint32_t ff_crc_clmul(const AVCRC *ctx, uint32_t crc,
+                      const uint8_t *buffer, size_t length);
+uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
+                         const uint8_t *buffer, size_t length);
+FF_VISIBILITY_POP_HIDDEN
+
+static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
+    [AV_CRC_8_ATM] = {
+        0x32000000, 0x0, 0xbc000000, 0x0,
+        0xc4000000, 0x0, 0x94000000, 0x0,
+        0x62000000, 0x0, 0x79000000, 0x0,
+        0x07156a16, 0x1, 0x07000000, 0x1,
+    },
+    [AV_CRC_8_EBU] = {
+        0xb5000000, 0x0, 0xf3000000, 0x0,
+        0xfc000000, 0x0, 0x0d000000, 0x0,
+        0x6a000000, 0x0, 0x65000000, 0x0,
+        0x1c4b8192, 0x1, 0x1d000000, 0x1,
+    },
+    [AV_CRC_16_ANSI] = {
+        0xf9e30000, 0x0, 0x807d0000, 0x0,
+        0xf9130000, 0x0, 0xff830000, 0x0,
+        0x807b0000, 0x0, 0x86630000, 0x0,
+        0xfffbffe7, 0x1, 0x80050000, 0x1,
+    },
+    [AV_CRC_16_CCITT] = {
+        0x60190000, 0x0, 0x59b00000, 0x0,
+        0xd5f60000, 0x0, 0x45630000, 0x0,
+        0xaa510000, 0x0, 0xeb230000, 0x0,
+        0x11303471, 0x1, 0x10210000, 0x1,
+    },
+    [AV_CRC_24_IEEE] = {
+        0x1f428700, 0x0, 0x467d2400, 0x0,
+        0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
+        0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
+        0xf845fe24, 0x1, 0x864cfb00, 0x1,
+    },
+    [AV_CRC_32_IEEE] = {
+        0x8833794c, 0x0, 0xe6228b11, 0x0,
+        0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
+        0x490d678d, 0x0, 0xf200aa66, 0x0,
+        0x04d101df, 0x1, 0x04c11db7, 0x1,
+    },
+    [AV_CRC_32_IEEE_LE] = {
+        0xc6e41596, 0x1, 0x54442bd4, 0x1,
+        0xccaa009e, 0x0, 0x751997d0, 0x1,
+        0xccaa009e, 0x0, 0x63cd6124, 0x1,
+        0xf7011640, 0x1, 0xdb710641, 0x1,
+    },
+    [AV_CRC_16_ANSI_LE] = {
+        0x0000bffa, 0x0, 0x0001b0c2, 0x0,
+        0x00018cc2, 0x0, 0x0001d0c2, 0x0,
+        0x00018cc2, 0x0, 0x0001bc02, 0x0,
+        0xcfffbffe, 0x1, 0x00014003, 0x0,
+    },
+};
+
+static uint64_t reverse(uint64_t p, unsigned int deg)
+{
+    uint64_t ret = 0;
+    int i;
+    for (i = 0; i < (deg / 8); i += 1) {
+        ret = (ret << 8) | (ff_reverse[p & 0xff]);
+        p >>= 8;
+    }
+    int rem = (deg + 1) - 8 * i;
+    ret = (ret << rem) | (ff_reverse[p & 0xff] >> (8 - rem));
+    return ret;
+}
+
+static uint64_t xnmodp(unsigned n, uint64_t poly, unsigned deg, uint64_t *div,
+                       int bitreverse)
+{
+    uint64_t mod, mask, high;
+
+    if (n < deg) {
+        *div = 0;
+        return poly;
+    }
+    mask = ((uint64_t)1 << deg) - 1;
+    poly &= mask;
+    mod = poly;
+    *div = 1;
+    deg--;
+    while (--n > deg) {
+        high = (mod >> deg) & 1;
+        *div = (*div << 1) | high;
+        mod <<= 1;
+        if (high)
+            mod ^= poly;
+    }
+    uint64_t ret = mod & mask;
+    if (bitreverse) {
+        *div = reverse(*div, deg) << 1;
+        return reverse(ret, deg) << 1;
+    }
+    return ret;
+}
+
+static inline void crc_init_x86(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
+{
+    uint64_t poly_;
+    if (le) {
+        // convert the reversed representation to the regular form
+        poly = reverse(poly, bits) >> 1;
+    }
+    // widen to a degree-32 polynomial
+    poly_ = ((uint64_t)poly) << (32 - bits);
+
+    uint64_t div;
+    uint8_t *dst = (uint8_t*)ctx;
+    if (le) {
+        AV_WN64(dst,      xnmodp(4 * 128 - 32, poly_, 32, &div, le));
+        AV_WN64(dst + 8,  xnmodp(4 * 128 + 32, poly_, 32, &div, le));
+        uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
+        AV_WN64(dst + 16, tmp);
+        AV_WN64(dst + 24, xnmodp(128 + 32, poly_, 32, &div, le));
+        AV_WN64(dst + 32, tmp);
+        AV_WN64(dst + 40, xnmodp(64, poly_, 32, &div, le));
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
+    } else {
+        AV_WN64(dst,      xnmodp(4 * 128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 8,  xnmodp(4 * 128, poly_, 32, &div, le));
+        AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le));
+        AV_WN64(dst + 24, xnmodp(128, poly_, 32, &div, le));
+        AV_WN64(dst + 32, xnmodp(64, poly_, 32, &div, le));
+        // store div from the x^64 computation before the next xnmodp() call overwrites it
+        AV_WN64(dst + 48, div);
+        AV_WN64(dst + 40, xnmodp(96, poly_, 32, &div, le));
+        AV_WN64(dst + 56, poly_ | (1ULL << 32));
+    }
+}
+#endif
+
+#endif /* AVUTIL_X86_CRC_H */

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
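
For readers new to the technique, the identity the Intel paper (and this patch)
builds on is that CRCs are remainders of polynomial division over GF(2): for a
message split as A || B, CRC(A || B) equals CRC(A) carry-less multiplied by the
precomputed constant x^(8*len(B)) mod P, reduced mod P, XORed with CRC(B). The
folding loops above apply exactly this, 128 bits at a time, with the x^n mod P
constants that crc_init_x86() writes into the table. Below is a scalar model of
the identity, not the committed SIMD code: it uses the non-reflected CRC-32
polynomial 0x04C11DB7 with zero initial value and no final XOR (so it ignores
av_crc's init/xor-out conventions), and the helper names (crc_bitwise, clmul32,
reduce64, xpow_mod) are hypothetical.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define POLY 0x04C11DB7u /* CRC-32 (IEEE) generator, non-reflected */

/* Bit-by-bit CRC register, MSB first: returns M(x)*x^32 mod P for crc = 0. */
static uint32_t crc_bitwise(const uint8_t *buf, size_t len, uint32_t crc)
{
    for (size_t i = 0; i < len; i++) {
        crc ^= (uint32_t)buf[i] << 24;
        for (int b = 0; b < 8; b++)
            crc = (crc & 0x80000000u) ? (crc << 1) ^ POLY : crc << 1;
    }
    return crc;
}

/* Carry-less 32x32 -> 63-bit product; what PCLMULQDQ does in one instruction. */
static uint64_t clmul32(uint32_t a, uint32_t b)
{
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if (b & (1u << i))
            r ^= (uint64_t)a << i;
    return r;
}

/* Reduce a polynomial of degree <= 62 modulo P (degree 32). */
static uint32_t reduce64(uint64_t v)
{
    const uint64_t p = (1ULL << 32) | POLY;
    for (int i = 62; i >= 32; i--)
        if (v & (1ULL << i))
            v ^= p << (i - 32);
    return (uint32_t)v;
}

/* x^nbits mod P: the kind of constant xnmodp() precomputes per polynomial. */
static uint32_t xpow_mod(unsigned nbits)
{
    uint32_t r = 1;
    while (nbits--)
        r = (r & 0x80000000u) ? (r << 1) ^ POLY : r << 1;
    return r;
}

int main(void)
{
    const uint8_t msg[] = "fold me with PCLMULQDQ";
    const size_t split = 7, rest = sizeof(msg) - 1 - split;

    /* Streaming reference: CRC of the whole message. */
    uint32_t ref = crc_bitwise(msg, sizeof(msg) - 1, 0);

    /* Folded: shift CRC(A) past B with one carry-less multiply, then XOR. */
    uint32_t crc_a   = crc_bitwise(msg, split, 0);
    uint32_t crc_b   = crc_bitwise(msg + split, rest, 0);
    uint32_t shifted = reduce64(clmul32(crc_a, xpow_mod(8 * rest)));

    assert((shifted ^ crc_b) == ref);
    return 0;
}

Because the multiply is carry-less, the "shift past B" costs one clmul plus a
reduction regardless of how long B is, which is what makes processing 64 bytes
per iteration in .fold_4x_loop profitable.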
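
The same step in intrinsics form: per 16-byte block, FOLD_SINGLE in crc.asm
computes fold = fold.lo64 x k.hi64 ^ fold.hi64 x k.lo64 ^ data, where the two
halves of k hold a pair of x^n mod P constants from the table. A sketch of that
one macro (assuming a compiler with PCLMUL support, e.g. gcc/clang with
-mpclmul; fold_single is a hypothetical name):

#include <wmmintrin.h> /* _mm_clmulepi64_si128 */

/* Folds the 128-bit running remainder across one 16-byte block using two
 * carry-less multiplies, mirroring the FOLD_SINGLE macro above. */
static __m128i fold_single(__m128i fold, __m128i k, __m128i data)
{
    __m128i t = _mm_clmulepi64_si128(fold, k, 0x01); /* fold.hi64 x k.lo64 */
    fold      = _mm_clmulepi64_si128(fold, k, 0x10); /* fold.lo64 x k.hi64 */
    return _mm_xor_si128(_mm_xor_si128(fold, t), data);
}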
