apv: AVX2 transquant for x86-64

James Almer Thu, 24 Apr 2025 14:41:37 -0700

On 4/24/2025 5:37 PM, Mark Thompson wrote:

On 24/04/2025 03:55, James Almer wrote:

On 4/23/2025 5:45 PM, Mark Thompson wrote:

Typical checkasm result on Alder Lake:


decode_transquant_8_c:                                 464.2 ( 1.00x)
decode_transquant_8_avx2:                               86.2 ( 5.38x)
decode_transquant_10_c:                                481.6 ( 1.00x)
decode_transquant_10_avx2:                              83.5 ( 5.77x)
---
   libavcodec/apv_dsp.c          |   4 +
   libavcodec/apv_dsp.h          |   2 +
   libavcodec/x86/Makefile       |   2 +
   libavcodec/x86/apv_dsp.asm    | 311 ++++++++++++++++++++++++++++++++++
   libavcodec/x86/apv_dsp_init.c |  44 +++++
   tests/checkasm/Makefile       |   1 +
   tests/checkasm/apv_dsp.c      | 109 ++++++++++++
   tests/checkasm/checkasm.c     |   3 +
   tests/checkasm/checkasm.h     |   1 +
   tests/fate/checkasm.mak       |   1 +
   10 files changed, 478 insertions(+)
   create mode 100644 libavcodec/x86/apv_dsp.asm
   create mode 100644 libavcodec/x86/apv_dsp_init.c
   create mode 100644 tests/checkasm/apv_dsp.c

...
diff --git a/libavcodec/x86/apv_dsp.asm b/libavcodec/x86/apv_dsp.asm
new file mode 100644
index 0000000000..12d96481de
--- /dev/null
+++ b/libavcodec/x86/apv_dsp.asm
@@ -0,0 +1,311 @@
+;******************************************************************************
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+; Full matrix for row transform.
+const tmatrix_row
+    dw  64,  89,  84,  75,  64,  50,  35,  18
+    dw  64, -18, -84,  50,  64, -75, -35,  89
+    dw  64,  75,  35, -18, -64, -89, -84, -50
+    dw  64, -50, -35,  89, -64, -18,  84, -75
+    dw  64,  50, -35, -89, -64,  18,  84,  75
+    dw  64, -75,  35,  18, -64,  89, -84,  50
+    dw  64,  18, -84, -50,  64,  75, -35, -89
+    dw  64, -89,  84, -75,  64, -50,  35, -18
+
+; Constant pairs for broadcast in column transform.
+const tmatrix_col_even
+    dw  64,  64,  64, -64
+    dw  84,  35,  35, -84
+const tmatrix_col_odd
+    dw  89,  75,  50,  18
+    dw  75, -18, -89, -50
+    dw  50, -89,  18,  75
+    dw  18, -50,  75, -89
+
+; Memory targets for vpbroadcastd (register version requires AVX512).
+cextern pd_1
+const sixtyfour
+    dd  64
+
+SECTION .text
+
+; void ff_apv_decode_transquant_avx2(void *output,
+;                                    ptrdiff_t pitch,
+;                                    const int16_t *input,
+;                                    const int16_t *qmatrix,
+;                                    int bit_depth,
+;                                    int qp_shift);
+
+INIT_YMM avx2
+
+cglobal apv_decode_transquant, 5, 7, 16, output, pitch, input, qmatrix, bit_depth, qp_shift, tmp
+
+    ; Load input and dequantise
+
+    vpbroadcastd  m10, [pd_1]
+    lea       tmpd, [bit_depthd - 2]
+    movd      xm8, qp_shiftm
+    movd      xm9, tmpd
+    vpslld    m10, m10, xm9
+    vpsrld    m10, m10, 1
+
+    ; m8  = scalar qp_shift
+    ; m9  = scalar bd_shift
+    ; m10 = vector 1 << (bd_shift - 1)
+    ; m11 = qmatrix load
+
+%macro LOAD_AND_DEQUANT 2 ; (xmm input, constant offset)
+    vpmovsxwd m%1, [inputq   + %2]
+    vpmovsxwd m11, [qmatrixq + %2]
+    vpmaddwd  m%1, m%1, m11
+    vpslld    m%1, m%1, xm8
+    vpaddd    m%1, m%1, m10
+    vpsrad    m%1, m%1, xm9
+    vpackssdw m%1, m%1, m%1
+%endmacro
+
+    LOAD_AND_DEQUANT 0, 0x00
+    LOAD_AND_DEQUANT 1, 0x10
+    LOAD_AND_DEQUANT 2, 0x20
+    LOAD_AND_DEQUANT 3, 0x30
+    LOAD_AND_DEQUANT 4, 0x40
+    LOAD_AND_DEQUANT 5, 0x50
+    LOAD_AND_DEQUANT 6, 0x60
+    LOAD_AND_DEQUANT 7, 0x70
+
+    ; mN = row N words 0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7
+
+    ; Transform columns
+    ; This applies a 1-D DCT butterfly
+
+    vpunpcklwd  m12, m0,  m4
+    vpunpcklwd  m13, m2,  m6
+    vpunpcklwd  m14, m1,  m3
+    vpunpcklwd  m15, m5,  m7
+
+    ; m12 = rows 0 and 4 interleaved
+    ; m13 = rows 2 and 6 interleaved
+    ; m14 = rows 1 and 3 interleaved
+    ; m15 = rows 5 and 7 interleaved
+
+    lea         tmpq, [tmatrix_col_even]
+    vpbroadcastd   m0, [tmpq + 0x00]
+    vpbroadcastd   m1, [tmpq + 0x04]
+    vpbroadcastd   m2, [tmpq + 0x08]
+    vpbroadcastd   m3, [tmpq + 0x0c]


How about

     vbroadcasti128   m0, [tmatrix_col_even]
     pshufd   m1, m0, q1111
     pshufd   m2, m0, q2222
     pshufd   m3, m0, q3333
     pshufd   m0, m0, q0000

So you remove the lea, and do a single load from memory within a single 
cross-lane instruction, instead of four of each.

Same below.


The broadcasts from memory are not slow, they don't read from either lane.

I can't measure a difference but instruction tables have vpbroadcastd as 1/3 and 
pshufd as 1/2 so I think I'll take that as a tie-break?  (lea is free and they 
will all load together, the vbroadcasti128 load is unaligned but pretty sure 
that is irrelevant.)

AVX doesn't care about alignment outside of instructions that are explicit about it (so movdqa/movaps). vbroadcasti128 in any case loads 16 bytes and tmatrix_col_even seems to be 16-byte aligned.

Looking at Skylake and newer, vpbroadcastd has 4 cycle latency and 0.5 throughput, so by the time the results are stored, the pmaddwd will be executed. Meanwhile, vbroadcasti128 has 3 latency, so the pshufd will not execute immediately.

I guess your version may be better.

OpenPGP_signature.asc
Description: OpenPGP digital signature

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 5/7] lavc/apv: AVX2 transquant for x86-64

Reply via email to