me_cmp: Improve median_sad functions

Andreas Rheinhardt via ffmpeg-cvslog Fri, 03 Jul 2026 07:58:50 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 061b28fad6e804b75c87ad2b3136827270fe31e6
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Jun 30 14:28:01 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Jul 3 16:09:57 2026 +0200

    avcodec/x86/me_cmp: Improve median_sad functions
    
    The median_sad functions involve computing a predictor
    for a pixel difference; deviations from this predicted
    pixel difference are summed to the result. For the leftmost
    element of a row, the predictor is simply the leftmost
    element of the last row (if any; otherwise zero).
    For the other elements, it is the median of top, left
    and (top + left - topleft).
    
    The current approach to deal with this is to treat
    the first element specially (by using a special accumulator
    just for it and masking at the end). But there is a better way:
    If the left and topleft predictors are zero, the median prediction
    is top and therefore yields the correct result for the leftmost
    element, obviating the need for the special case.
    
    Creating the registers with zeroed left predictors is easy:
    Just shift in the opposite direction from how it is done now.
    
    Old benchmarks:
      median_sad_0_c:            429.9
      median_sad_0_ssse3:         43.7 ( 9.83x)
      median_sad_1_c:            189.9
      median_sad_1_ssse3:         24.5 ( 7.75x)
    
    New benchmarks:
      median_sad_0_c:            431.1
      median_sad_0_ssse3:         39.7 (10.84x)
      median_sad_1_c:            190.6
      median_sad_1_ssse3:         20.4 ( 9.36x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/me_cmp.asm | 82 ++++++++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 51 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 17e8baeba0..5408946245 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -822,7 +822,7 @@ VSAD_APPROX 16, u
 ; the subtraction.  The shifted columns are derived from the unshifted word
 ; vectors, so no out-of-bounds loads are made.
 ; %1: V columns 0-7,  %2: V columns 8-15
-; %3: V columns 1-8,  %4: V columns 9-16 (column 16 is zero)
+; %3: 0w followed by V columns 0-6,  %4: V columns 7-14
 ; %5: scratch register, its contents are irrelevant
 %macro LOAD_V16 5
     movu      %1, [pix1q]
@@ -833,19 +833,19 @@ VSAD_APPROX 16, u
     punpcklbw %3, %5
     psubw     %1, %3            ; V columns 0-7
     psubw     %2, %4            ; V columns 8-15
-    palignr   %3, %2, %1, 2     ; V columns 1-8
-    psrldq    %4, %2, 2         ; V columns 9-16
+    pslldq    %3, %1, 2         ; 0w followed by V columns 0-6
+    palignr   %4, %2, %1, 14    ; V columns 7-14
 %endmacro
 
 ; Same as LOAD_V16 for one row of 8 pixels.
-; %1: V columns 0-7, %2: V columns 1-8 (column 8 is zero), %3: scratch register
+; %1: V columns 0-7, %2: 0w followed by V columns 0-6, %3: scratch register
 %macro LOAD_V8 3
     movq      %1, [pix1q]
     movq      %2, [pix2q]
     punpcklbw %1, %3
     punpcklbw %2, %3
     psubw     %1, %2            ; V columns 0-7
-    psrldq    %2, %1, 2         ; V columns 1-8
+    pslldq    %2, %1, 2         ; 0w, V columns 0-6
 %endmacro
 
 ; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
@@ -866,30 +866,27 @@ VSAD_APPROX 16, u
 %endmacro
 
 ; Accumulate one row's cost from the previous and current row vectors.
-; %1-%4: previous row V (columns 0-7, 8-15, 1-8, 9-16)
-; %5-%8: current  row V (columns 0-7, 8-15, 1-8, 9-16), loaded here
-; m0-m2 are the accumulators, m11/m12 temporaries, m14 scratch.  The top
+; %1-%4: previous row V (columns 0-7, 8-15, 0-6, 7-14)
+; %5-%8: current  row V (columns 0-7, 8-15, 0-6, 7-14), loaded here
+; m0 is the accumulator, m11/m12 temporaries, m14 scratch.  The top
 ; predictors %3/%4 are consumed by MEDIAN_ABS_ACC, but they belong to the
 ; previous row and are reloaded before being needed again.
 %macro PROCESS_ROW16 8
     LOAD_V16  %5, %6, %7, %8, m14
     add       pix1q, strideq
     add       pix2q, strideq
-    ; column 0: abs(V(0) - V(-stride))
-    psubw     m11, %5, %1
-    pabsw     m11, m11
-    paddw     m2, m11
-    ; columns 1-8 and 9-16
-    MEDIAN_ABS_ACC m0, %3, %5, %1, %7, m11, m12
-    MEDIAN_ABS_ACC m1, %4, %6, %2, %8, m11, m12
+    ; columns 0-7; no special case for the first element lacking
+    ; left and top-left predictors is needed here: The left vectors
+    ; have 0 as first element which leads to the desired result.
+    MEDIAN_ABS_ACC m0, %1, %7, %3, %5, m11, m12
+    ; columns 8-15
+    MEDIAN_ABS_ACC m0, %2, %8, %4, %6, m11, m12
 %endmacro
 
 ; Register layout:
-;   m0  accumulator for columns 1-8
-;   m1  accumulator for columns 9-16 (the last word is discarded at the end)
-;   m2  accumulator for column 0 (only the first word is used)
-;   m3-m6   one row's V (columns 0-7, 8-15, 1-8, 9-16)
-;   m7-m10  the other row's V (columns 0-7, 8-15, 1-8, 9-16)
+;   m0  accumulator
+;   m3-m6   one row's V (columns 0-7, 8-15, 0-6, 7-14)
+;   m7-m10  the other row's V (columns 0-7, 8-15, 0-6, 7-14)
 ;   m11, m12 temporaries
 ;   m14 scratch register for LOAD_V16
 ; The loop is unrolled by two so the two register sets alternate the roles of
@@ -901,11 +898,11 @@ cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
     add       pix2q, strideq
 
     ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
-    pabsw     m2, m3
     psubw     m0, m5, m3
     pabsw     m0, m0
     psubw     m1, m6, m4
     pabsw     m1, m1
+    paddw     m0, m1
 
     sub       hd, 1
     jle       .end
@@ -917,15 +914,8 @@ cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
     sub       hd, 1
     jg        .loop
 .end:
-    ; column 16 lies outside of the block and column 0 only contributes its
-    ; first word; the kept columns may end up in any lane since the final sum
-    ; is horizontal anyway
-    pslldq    m1, 2
-    pslldq    m2, 14
-    paddw     m0, m1
-    paddw     m0, m2
-    ; the per-word sums are at most 16 * 510, but their total needs more than
-    ; 16 bits: widen to dwords before the horizontal sum
+    ; the per-word sums are at most 2 * 16 * 510, but their total may need
+    ; more than 16 bits: widen to dwords before the horizontal sum
     pxor      m1, m1
     punpckhwd m12, m0, m1
     punpcklwd m0, m1
@@ -939,26 +929,23 @@ INIT_XMM ssse3
 MEDIAN_SAD16
 
 ; Accumulate one row's cost from the previous and current row vectors.
-; %1: previous row V columns 0-7, %2: previous row V columns 1-8
-; %3: current  row V columns 0-7, %4: current  row V columns 1-8 (loaded here)
-; m0/m1 are the accumulators, m7/m8 temporaries, m9 scratch.
+; %1: previous row V columns 0-7, %2: previous row V columns 0-6
+; %3: current  row V columns 0-7, %4: current  row V columns 0-6 (loaded here)
+; m0 is the accumulator, m7/m8 temporaries, m9 scratch.
 %macro PROCESS_ROW8 4
     LOAD_V8   %3, %4, m9
     add       pix1q, strideq
     add       pix2q, strideq
-    ; column 0: abs(V(0) - V(-stride))
-    psubw     m7, %3, %1
-    pabsw     m7, m7
-    paddw     m1, m7
-    ; columns 1-8
-    MEDIAN_ABS_ACC m0, %2, %3, %1, %4, m7, m8
+    ; No special case for the first element lacking left and top-left
+    ; predictors is needed here: The left vectors have 0 as first element
+    ; which leads to the desired result.
+    MEDIAN_ABS_ACC m0, %1, %4, %2, %3, m7, m8
 %endmacro
 
 ; Register layout:
-;   m0  accumulator for columns 1-8 (the last word is discarded at the end)
-;   m1  accumulator for column 0 (only the first word is used)
-;   m2, m3  one row's V (columns 0-7, 1-8)
-;   m5, m6  the other row's V (columns 0-7, 1-8)
+;   m0  accumulator for columns 0-7
+;   m2, m3  one row's V (columns 0-7, 0-6)
+;   m5, m6  the other row's V (columns 0-7, 0-6)
 ;   m7, m8 temporaries
 ;   m9  scratch register for LOAD_V8
 ; As in median_sad16 the loop is unrolled by two so the two register sets
@@ -970,8 +957,7 @@ cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h
     add       pix2q, strideq
 
     ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
-    pabsw     m1, m2
-    psubw     m0, m3, m2
+    psubw     m0, m2, m3
     pabsw     m0, m0
 
     sub       hd, 1
@@ -984,12 +970,6 @@ cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h
     sub       hd, 1
     jg        .loop
 .end:
-    ; column 8 lies outside of the block and column 0 only contributes its
-    ; first word; the kept columns may end up in any lane since the final sum
-    ; is horizontal anyway
-    pslldq    m0, 2
-    pslldq    m1, 14
-    paddw     m0, m1
     pxor      m4, m4
     punpckhwd m7, m0, m4
     punpcklwd m0, m4

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 01/04: avcodec/x86/me_cmp: Improve median_sad functions

Reply via email to