This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit dc03cffe9c9577127ef82b6f56118115f900e5f2
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Dec 7 11:46:41 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Jan 4 15:49:30 2026 +0100

    avutil/crc: Use x86 clmul for CRC when available
    
    Observed near 10x speedup on AMD Zen4 7950x:
    av_crc_c:                                            22057.0 ( 1.00x)
    av_crc_clmul:                                         2202.8 (10.01x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/crc.c        | 24 +++++++++++++++++++
 libavutil/x86/Makefile |  1 +
 libavutil/x86/crc.asm  | 11 +++++----
 libavutil/x86/crc.h    | 62 +++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/libavutil/crc.c b/libavutil/crc.c
index 34f507ead8..0b8d66d86d 100644
--- a/libavutil/crc.c
+++ b/libavutil/crc.c
@@ -25,6 +25,9 @@
 #include "bswap.h"
 #include "crc.h"
 #include "error.h"
+#if ARCH_X86
+#include "libavutil/x86/crc.h"
+#endif
 
 #if CONFIG_HARDCODED_TABLES
 static const AVCRC av_crc_table[AV_CRC_MAX][257] = {
@@ -348,6 +351,12 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t 
poly, int ctx_size)
     if (ctx_size != sizeof(AVCRC) * 257 && ctx_size != sizeof(AVCRC) * 1024)
         return AVERROR(EINVAL);
 
+#if ARCH_X86
+    int done = ff_crc_init_x86(ctx, le, bits, poly, ctx_size);
+    if (done)
+        return 0;
+#endif
+
     for (i = 0; i < 256; i++) {
         if (le) {
             for (c = i, j = 0; j < 8; j++)
@@ -375,6 +384,14 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id)
 {
     if ((unsigned)crc_id >= AV_CRC_MAX)
         return NULL;
+// Check for arch-specific extensions first to avoid initializing
+// ordinary CRC tables unnecessarily.
+#if ARCH_X86
+    const AVCRC *table = ff_crc_get_table_x86(crc_id);
+    if (table)
+        return table;
+#endif
+
 #if !CONFIG_HARDCODED_TABLES
     switch (crc_id) {
     case AV_CRC_8_ATM:      CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM); break;
@@ -394,6 +411,13 @@ const AVCRC *av_crc_get_table(AVCRCId crc_id)
 uint32_t av_crc(const AVCRC *ctx, uint32_t crc,
                 const uint8_t *buffer, size_t length)
 {
+    if (ctx[0]) {
+#if ARCH_X86
+        return ff_crc_x86(ctx, crc, buffer, length);
+#endif
+    }
+    av_assert2(ctx[0] == 0);
+
     const uint8_t *end = buffer + length;
 
 #if !CONFIG_SMALL
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 4e1b4b1176..901298b6cb 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -4,6 +4,7 @@ 
EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
 
 X86ASM-OBJS += x86/aes.o x86/aes_init.o                                 \
                x86/cpuid.o                                              \
+               x86/crc.o                                                \
                $(EMMS_OBJS__yes_)                                       \
                x86/fixed_dsp.o x86/fixed_dsp_init.o                     \
                x86/float_dsp.o x86/float_dsp_init.o                     \
diff --git a/libavutil/x86/crc.asm b/libavutil/x86/crc.asm
index 95cf90d250..4f5673fbd7 100644
--- a/libavutil/x86/crc.asm
+++ b/libavutil/x86/crc.asm
@@ -138,6 +138,7 @@ SECTION .text
 %endmacro
 
 %macro CRC 1
+%define CTX r0+4
 
;-----------------------------------------------------------------------------------------------
 ; ff_crc[_le]_clmul(const uint8_t *ctx, uint32_t crc, const uint8_t *buffer, 
size_t length
 
;-----------------------------------------------------------------------------------------------
@@ -177,7 +178,7 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
     mov    r4, 64
     cmp    r3, 128
     jb    .reduce_4x_to_1
-    movu   m4, [r0]
+    movu   m4, [CTX]
 
 .fold_4x_loop:
         movu        m6, [r2 + r4 +  0]
@@ -200,7 +201,7 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
         jbe        .fold_4x_loop
 
 .reduce_4x_to_1:
-    movu        m4, [r0 + 16]
+    movu        m4, [CTX + 16]
     FOLD_SINGLE m5, m1, m4, m3
     FOLD_SINGLE m5, m1, m4, m2
     FOLD_SINGLE m5, m1, m4, m0
@@ -245,10 +246,10 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
     FOLD_SINGLE m5, m1, m4, m2
 
 .reduce_128_to_64:
-    movu           m4, [r0 + 32]
+    movu           m4, [CTX + 32]
     FOLD_128_TO_64 %1, m1, m4, m5
 .reduce_64_to_32:
-    movu           m4, [r0 + 48]
+    movu           m4, [CTX + 48]
     FOLD_64_TO_32  %1, m1, m4, m5
     RET
 
@@ -261,7 +262,7 @@ cglobal crc,    4, 6, 7+4*ARCH_X86_64, 0x10
     pshufb m1, m10
 %endif
     mov    r4, 16
-    movu   m4, [r0 + 16]
+    movu   m4, [CTX + 16]
     jmp   .fold_1x_pre
 
 .less_than_16bytes:
diff --git a/libavutil/x86/crc.h b/libavutil/x86/crc.h
index 5fabfa7570..c836c090c6 100644
--- a/libavutil/x86/crc.h
+++ b/libavutil/x86/crc.h
@@ -24,8 +24,10 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/attributes_internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/reverse.h"
 #include "libavutil/x86/cpu.h"
 
@@ -37,50 +39,64 @@ uint32_t ff_crc_le_clmul(const AVCRC *ctx, uint32_t crc,
                          const uint8_t *buffer, size_t length);
 FF_VISIBILITY_POP_HIDDEN
 
-static const AVCRC crc_table_clmul[AV_CRC_MAX][16] = {
+enum {
+    CRC_C    = 0,
+    CLMUL_BE,
+    CLMUL_LE,
+};
+
+static const AVCRC crc_table_clmul[AV_CRC_MAX][17] = {
     [AV_CRC_8_ATM] = {
+        CLMUL_BE,
         0x32000000, 0x0, 0xbc000000, 0x0,
         0xc4000000, 0x0, 0x94000000, 0x0,
         0x62000000, 0x0, 0x79000000, 0x0,
         0x07156a16, 0x1, 0x07000000, 0x1,
     },
     [AV_CRC_8_EBU] = {
+        CLMUL_BE,
         0xb5000000, 0x0, 0xf3000000, 0x0,
         0xfc000000, 0x0, 0x0d000000, 0x0,
         0x6a000000, 0x0, 0x65000000, 0x0,
         0x1c4b8192, 0x1, 0x1d000000, 0x1,
     },
     [AV_CRC_16_ANSI] = {
+        CLMUL_BE,
         0xf9e30000, 0x0, 0x807d0000, 0x0,
         0xf9130000, 0x0, 0xff830000, 0x0,
         0x807b0000, 0x0, 0x86630000, 0x0,
         0xfffbffe7, 0x1, 0x80050000, 0x1,
     },
     [AV_CRC_16_CCITT] = {
+        CLMUL_BE,
         0x60190000, 0x0, 0x59b00000, 0x0,
         0xd5f60000, 0x0, 0x45630000, 0x0,
         0xaa510000, 0x0, 0xeb230000, 0x0,
         0x11303471, 0x1, 0x10210000, 0x1,
     },
     [AV_CRC_24_IEEE] = {
+        CLMUL_BE,
         0x1f428700, 0x0, 0x467d2400, 0x0,
         0x2c8c9d00, 0x0, 0x64e4d700, 0x0,
         0xd9fe8c00, 0x0, 0xfd7e0c00, 0x0,
         0xf845fe24, 0x1, 0x864cfb00, 0x1,
     },
     [AV_CRC_32_IEEE] = {
+        CLMUL_BE,
         0x8833794c, 0x0, 0xe6228b11, 0x0,
         0xc5b9cd4c, 0x0, 0xe8a45605, 0x0,
         0x490d678d, 0x0, 0xf200aa66, 0x0,
         0x04d101df, 0x1, 0x04c11db7, 0x1,
     },
     [AV_CRC_32_IEEE_LE] = {
+        CLMUL_LE,
         0xc6e41596, 0x1, 0x54442bd4, 0x1,
         0xccaa009e, 0x0, 0x751997d0, 0x1,
         0xccaa009e, 0x0, 0x63cd6124, 0x1,
         0xf7011640, 0x1, 0xdb710641, 0x1,
     },
     [AV_CRC_16_ANSI_LE] = {
+        CLMUL_LE,
         0x0000bffa, 0x0, 0x1b0c2, 0x0,
         0x00018cc2, 0x0, 0x1d0c2, 0x0,
         0x00018cc2, 0x0, 0x1bc02, 0x0,
@@ -139,9 +155,10 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int 
bits, uint32_t poly, int
     // convert to 32 degree polynomial
     poly_ = ((uint64_t)poly) << (32 - bits);
 
-    uint64_t x1, x2, x3, x4, x5, x6, x7, x8, div;
-    uint8_t *dst = (uint8_t*)ctx;
+    uint64_t div;
+    uint8_t *dst = (uint8_t*)(ctx + 1);
     if (le) {
+        ctx[0] = CLMUL_LE;
         AV_WN64(dst,      xnmodp(4 * 128 - 32, poly_, 32, &div, le));
         AV_WN64(dst +  8, xnmodp(4 * 128 + 32, poly_, 32, &div, le));
         uint64_t tmp = xnmodp(128 - 32, poly_, 32, &div, le);
@@ -152,6 +169,7 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int 
bits, uint32_t poly, int
         AV_WN64(dst + 48, div);
         AV_WN64(dst + 56, reverse(poly_ | (1ULL << 32), 32));
     } else {
+        ctx[0] = CLMUL_BE;
         AV_WN64(dst,      xnmodp(4 * 128 + 64, poly_, 32, &div, le));
         AV_WN64(dst +  8, xnmodp(4 * 128, poly_, 32, &div, le));
         AV_WN64(dst + 16, xnmodp(128 + 64, poly_, 32, &div, le));
@@ -164,4 +182,42 @@ static inline void crc_init_x86(AVCRC *ctx, int le, int 
bits, uint32_t poly, int
 }
 #endif
 
+static inline const AVCRC *ff_crc_get_table_x86(AVCRCId crc_id)
+{
+#if HAVE_CLMUL_EXTERNAL
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_CLMUL(cpu_flags)) {
+        return crc_table_clmul[crc_id];
+    }
+#endif
+    return NULL;
+}
+
+static inline av_cold int ff_crc_init_x86(AVCRC *ctx, int le, int bits, 
uint32_t poly, int ctx_size)
+{
+#if HAVE_CLMUL_EXTERNAL
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_CLMUL(cpu_flags)) {
+        crc_init_x86(ctx, le, bits, poly, ctx_size);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+static inline uint32_t ff_crc_x86(const AVCRC *ctx, uint32_t crc,
+                                  const uint8_t *buffer, size_t length)
+{
+    switch (ctx[0]) {
+#if HAVE_CLMUL_EXTERNAL
+    case CLMUL_BE: return ff_crc_clmul(ctx, crc, buffer, length);
+    case CLMUL_LE: return ff_crc_le_clmul(ctx, crc, buffer, length);
+#endif
+    default: av_unreachable("x86 CRC only uses CLMUL_BE and CLMUL_LE");
+    }
+    return 0;
+}
+
 #endif /* AVUTIL_X86_CRC_H */

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to