The branch, master has been updated
       via  ddf443f1e99c94b5e3569904027eba868691b86b (commit)
       via  f8bcea4946d2ce1f5271a550fcc9131797505eed (commit)
       via  0c556a6b09b43a977e12cab346a55370bb09dd05 (commit)
       via  778ff97efa0f4622136de73cef3b19ed1bcc082a (commit)
       via  f4421457291c1ffa8db328c806d73b844b3e3450 (commit)
       via  c0648b200497f56b611f0be86871269aa073c90b (commit)
       via  06b0dae51bce088ec304771e25ab02c3846e169d (commit)
       via  cc97f1e276241d93139b3ac46a358b7b9f77066b (commit)
       via  3cd452cbf15459334e52c7f2aa92654c822732d5 (commit)
       via  ddd74276f85b3c53809d59ffc640b7b45f5a125f (commit)
       via  68b11cde8212b8ea0309ef6d11b01c782fa0b943 (commit)
       via  63493bf0e0909e701b64392be419f69491b8cbf1 (commit)
       via  66af18d06a3faf9f8960ad6bd5a400701a0cdaab (commit)
       via  1049a5fba8f9437b94050105be8d32545675315e (commit)
       via  d19050a1ae90b4ad8e9e2dadc5c8ca0c39301d69 (commit)
       via  ff85a20b7db4d3226ada8533b181989944f30e75 (commit)
       via  570f8fc6c9850edf6c05d58dea0629f162199f20 (commit)
       via  e042f17e9947779e3b1b981218370472940ca3c6 (commit)
       via  52ba2ac7bd48d09d1f8527376970e2b0e8ee5068 (commit)
       via  70eb8a76a91e9c9fe3a6c0b4f1c2ff28f5447086 (commit)
       via  9f4d5d818d709788ab6b199a634a95a2bfcd4898 (commit)
       via  1699de09551da5efe413637fcb4c90bcaea31b4c (commit)
       via  9b34088c4dfec112170a0a0102acb3be1d77d240 (commit)
      from  57d6898730836ac2006d10bf18396752de092e49 (commit)


- Log -----------------------------------------------------------------
commit ddf443f1e99c94b5e3569904027eba868691b86b
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:57:02 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fsppdsp: Fix left shifts of negative numbers
    
    They are undefined behavior and UBSan warns about them
    (in the checkasm test). Put the shifts in the constants
    instead. This even gives a tiny speedup here:
    
    Old benchmarks:
    column_fidct_c:                                       3369.9 ( 1.00x)
    column_fidct_sse2:                                     829.1 ( 4.06x)
    New benchmarks:
    column_fidct_c:                                       3304.2 ( 1.00x)
    column_fidct_sse2:                                     827.9 ( 3.99x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 3230376a19..8025e87366 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -165,7 +165,7 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, 
const int16_t *restrict
             d0 = tmp10 + tmp11;
             d4 = tmp10 - tmp11;
 
-            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+            z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
             d2 = tmp13 + z1;
             d6 = tmp13 - z1;
 
@@ -193,10 +193,10 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, 
const int16_t *restrict
             tmp11 = tmp5 + tmp6;
             tmp12 = tmp6 + tmp7;
 
-            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
-            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
-            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
-            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+            z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+            z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+            z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+            z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
 
             z11 = tmp7 + z3;
             z13 = tmp7 - z3;
@@ -215,15 +215,15 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, 
const int16_t *restrict
 
             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
             z13 = tmp6 + tmp5;
-            z10 = (tmp6 - tmp5) << 1;
+            z10 = (tmp6 - tmp5) * 2;
             z11 = tmp4 + tmp7;
-            z12 = (tmp4 - tmp7) << 1;
+            z12 = (tmp4 - tmp7) * 2;
 
             tmp7  = (z11 + z13) >> 2; //+2 !
-            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
-            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
-            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
-            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - 
!!
+            tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
+            z5    = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - !!
 
             tmp6 = tmp12 - tmp7;
             tmp5 = tmp11 - tmp6;
@@ -264,7 +264,7 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t 
*restrict output_adr,
         tmp11 = wsptr[2] -  wsptr[3];
 
         tmp13 = wsptr[0] +  wsptr[1];
-        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - 
tmp13;//this shift order to avoid overflow
+        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - 
tmp13;//this shift order to avoid overflow
 
         tmp0 = tmp10 + tmp13; //->temps
         tmp3 = tmp10 - tmp13; //->temps
@@ -289,9 +289,9 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t 
*restrict output_adr,
         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
 
-        tmp6 = (tmp12 << 3) - tmp7;
-        tmp5 = (tmp11 << 3) - tmp6;
-        tmp4 = (tmp10 << 3) + tmp5;
+        tmp6 = tmp12 * 8 - tmp7;
+        tmp5 = tmp11 * 8 - tmp6;
+        tmp4 = tmp10 * 8 + tmp5;
 
         // Final output stage: descale and write column
         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
@@ -342,20 +342,20 @@ void ff_row_fdct_c(int16_t *restrict data, const uint8_t 
*restrict pixels,
         dataptr[2] = tmp10 + tmp11;
         dataptr[3] = tmp10 - tmp11;
 
-        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+        z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
         dataptr[0] = tmp13 + z1;
         dataptr[1] = tmp13 - z1;
 
         // Odd part
 
-        tmp10 = (tmp4 + tmp5) << 2;
-        tmp11 = (tmp5 + tmp6) << 2;
-        tmp12 = (tmp6 + tmp7) << 2;
+        tmp10 = tmp4 + tmp5;
+        tmp11 = tmp5 + tmp6;
+        tmp12 = tmp6 + tmp7;
 
-        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
-        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
-        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
-        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100 << 2) + z5;
+        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965 << 2) + z5;
+        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781 << 2);
 
         z11 = tmp7 + z3;
         z13 = tmp7 - z3;

commit f8bcea4946d2ce1f5271a550fcc9131797505eed
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 12:04:15 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fsppdsp: Remove pointless cast
    
    Also don't cast const away and use a smaller scope.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 7fdc5ece25..3230376a19 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -136,12 +136,11 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, 
const int16_t *restrict
     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
 
     int16_t *wsptr;
-    int16_t *threshold;
 
     wsptr = output;
 
     for (; cnt > 0; cnt -= 2) { //start positions
-        threshold = (int16_t *)thr_adr;//threshold_mtx
+        const int16_t *threshold = thr_adr;//threshold_mtx
         for (int ctr = DCTSIZE; ctr > 0; ctr--) {
             // Process columns from input, add to output.
             tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];

commit 0c556a6b09b43a977e12cab346a55370bb09dd05
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:18:28 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fspp: Pre-reorder threshold table
    
    Avoids reordering at runtime.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9e5c688fb2..cbf2e06d67 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -92,14 +92,16 @@ static const short custom_threshold[64] = {
 // values (296) can't be too high
 // -it causes too big quant dependence
 // or maybe overflow(check), which results in some flashing
-     71, 296, 295, 237,  71,  40,  38,  19,
-    245, 193, 185, 121, 102,  73,  53,  27,
-    158, 129, 141, 107,  97,  73,  50,  26,
-    102, 116, 109,  98,  82,  66,  45,  23,
-     71,  94,  95,  81,  70,  56,  38,  20,
-     56,  77,  74,  66,  56,  44,  30,  15,
-     38,  53,  50,  45,  38,  30,  21,  11,
-     20,  27,  26,  23,  20,  15,  11,   5
+// reorder coefficients to the order in which columns are processed
+#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
+    REORDER( 71, 296, 295, 237,  71,  40,  38,  19),
+    REORDER(245, 193, 185, 121, 102,  73,  53,  27),
+    REORDER(158, 129, 141, 107,  97,  73,  50,  26),
+    REORDER(102, 116, 109,  98,  82,  66,  45,  23),
+    REORDER( 71,  94,  95,  81,  70,  56,  38,  20),
+    REORDER( 56,  77,  74,  66,  56,  44,  30,  15),
+    REORDER( 38,  53,  50,  45,  38,  30,  21,  11),
+    REORDER( 20,  27,  26,  23,  20,  15,  11,   5)
 };
 
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -244,25 +246,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 
     int qp_stride = 0;
     int8_t *qp_table = NULL;
-    int i, bias;
     int ret = 0;
-    int custom_threshold_m[64];
-
-    bias = (1 << 4) + fspp->strength;
-
-    for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
-        custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 
0.5);
-
-    for (int i = 0; i < 64; i += 8) {
-        fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
-        fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
-        fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
-        fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
-        fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
-        fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
-        fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
-        fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
-    }
+
+    //FIXME: tune custom_threshold[] and remove this !
+    for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
+        fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) 
+ 0.5);
 
     if (fspp->qp) {
         fspp->prev_q = fspp->qp;

commit 778ff97efa0f4622136de73cef3b19ed1bcc082a
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:02:56 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fspp: Make output endian-independent
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 670e9288d9..9e5c688fb2 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,7 +37,6 @@
 
 #include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
-#include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
@@ -254,16 +253,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 
0.5);
 
-    for (i = 0; i < 8; i++) {
-        AV_WN64A(&fspp->threshold_mtx_noq[8 * i], 
(uint64_t)custom_threshold_m[i * 8 + 2]
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
6]) << 16)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
0]) << 32)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48));
-
-        AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], 
(uint64_t)custom_threshold_m[i * 8 + 5]
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 3]) << 16)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 1]) << 32)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48));
+    for (int i = 0; i < 64; i += 8) {
+        fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
+        fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
+        fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
+        fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
+        fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
+        fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
+        fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
+        fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
     }
 
     if (fspp->qp) {

commit f4421457291c1ffa8db328c806d73b844b3e3450
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 10:48:23 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fspp: Avoid casts, effective-type violations
    
    Maybe uint64_t has been used as a poor man's alignment specifier?
    Anyway, reading a uint64_t via an lvalue of type int16_t (as happens
    in the C versions of the dsp functions) is undefined behavior.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 3db7fe114e..670e9288d9 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,6 +37,7 @@
 
 #include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
@@ -71,8 +72,8 @@ typedef struct FSPPContext {
 
     FSPPDSPContext dsp;
 
-    DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
-    DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
 } FSPPContext;
 
 
@@ -154,7 +155,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
             p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y&1), stride, 2 * (BLOCKSZ - 1));
 
             if (p->qp)
-                p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 
0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+                p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 
0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
             else
                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                     t = x + x0 - 2;                    //correct 
t=x+x0-2-(y&1), but its the same
@@ -164,8 +165,11 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
                     t = qp_store[qy + (t >> qpsh)];
                     t = ff_norm_qscale(t, p->qscale_type);
 
-                    if (t != p->prev_q) p->prev_q = t, 
p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t 
*)(&p->threshold_mtx[0]), t);
-                    p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), 
block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+                    if (t != p->prev_q) {
+                        p->prev_q = t;
+                        p->dsp.mul_thrmat(p->threshold_mtx_noq, 
p->threshold_mtx, t);
+                    }
+                    p->dsp.column_fidct(p->threshold_mtx, block + x * 8, 
block3 + x * 8, 8); //yes, this is a HOTSPOT
                 }
             p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * 
sizeof(int16_t)); //cycling
@@ -176,7 +180,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
         if (es > 8)
             p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y & 1), stride, (es - 4) >> 2);
 
-        p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, 
es&(~1));
+        p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
         if (es > 3)
             p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, es >> 2);
 
@@ -251,19 +255,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 
0.5);
 
     for (i = 0; i < 8; i++) {
-        fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 
2]
+        AV_WN64A(&fspp->threshold_mtx_noq[8 * i], 
(uint64_t)custom_threshold_m[i * 8 + 2]
                                       |(((uint64_t)custom_threshold_m[i * 8 + 
6]) << 16)
                                       |(((uint64_t)custom_threshold_m[i * 8 + 
0]) << 32)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48);
+                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48));
 
-        fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 
8 + 5]
+        AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], 
(uint64_t)custom_threshold_m[i * 8 + 5]
                                           |(((uint64_t)custom_threshold_m[i * 
8 + 3]) << 16)
                                           |(((uint64_t)custom_threshold_m[i * 
8 + 1]) << 32)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48);
+                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48));
     }
 
-    if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t 
*)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), 
fspp->qp);
+    if (fspp->qp) {
+        fspp->prev_q = fspp->qp;
+        fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, 
fspp->qp);
+    }
 
     /* if we are not in a constant user quantizer mode and we don't want to use
      * the quantizers from the B-frames (B-frames often have a higher QP), we

commit c0648b200497f56b611f0be86871269aa073c90b
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:26:04 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/x86/vf_spp: Fix comment
    
    Forgotten in dcb28ed860166c9715afb7c71c70889e6b9b8c8d.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index 48c3d25d7c..7dcf18ec7d 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
     }
 }
 
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 
 av_cold void ff_spp_init_x86(SPPContext *s)
 {

commit 06b0dae51bce088ec304771e25ab02c3846e169d
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:15:24 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fsppdsp: Constify
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index e530bcd06b..7fdc5ece25 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -121,13 +121,13 @@ void ff_store_slice2_c(uint8_t *restrict dst, int16_t 
*restrict src,
     }
 }
 
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q)
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict 
thr_adr, int q)
 {
     for (int a = 0; a < 64; a++)
         thr_adr[a] = q * thr_adr_noq[a];
 }
 
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                        int16_t *restrict output, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -135,28 +135,26 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t 
*restrict data,
     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
 
-    int16_t *dataptr;
     int16_t *wsptr;
     int16_t *threshold;
 
-    dataptr = data;
     wsptr = output;
 
     for (; cnt > 0; cnt -= 2) { //start positions
         threshold = (int16_t *)thr_adr;//threshold_mtx
         for (int ctr = DCTSIZE; ctr > 0; ctr--) {
             // Process columns from input, add to output.
-            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
-            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+            tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
+            tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
 
-            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
-            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+            tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
+            tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
 
-            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
-            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+            tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
+            tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
 
-            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
-            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+            tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
+            tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
 
             // Even part of FDCT
 
@@ -241,26 +239,24 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t 
*restrict data,
             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
             //
-            dataptr++; //next column
+            data++; //next column
             wsptr++;
             threshold++;
         }
-        dataptr += 8; //skip each second start pos
+        data  += 8; //skip each second start pos
         wsptr   += 8;
     }
 }
 
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
                    ptrdiff_t output_stride, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
     int_simd16_t z5, z10, z11, z12, z13;
     int16_t *outptr;
-    int16_t *wsptr;
 
     cnt *= 4;
-    wsptr = workspace;
     outptr = output_adr;
     for (; cnt > 0; cnt--) {
         // Even part
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 66030da4b1..5a2f1af030 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,13 +39,13 @@ typedef struct FSPPDSPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
-    void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+    void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
                        int16_t *restrict thr_adr /* align 16 */, int q);
 
-    void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+    void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                          int16_t *restrict output, int cnt);
 
-    void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
+    void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict 
output_adr,
                      ptrdiff_t output_stride, int cnt);
 
     void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
@@ -61,10 +61,10 @@ void ff_store_slice_c(uint8_t *restrict dst, int16_t 
*restrict src,
 void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q);
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict 
thr_adr, int q);
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                        int16_t *restrict output, int cnt);
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict 
output_adr,
                    ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
                    ptrdiff_t line_size, int cnt);
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index c7a9b1799e..caf94b30d6 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,9 +29,9 @@ void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
 void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
-void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
+void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t 
*output, int cnt);
+void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t 
line_size, int cnt);
 
 av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index ccb9cd1e7d..496e859fe0 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -116,7 +116,7 @@ static void check_mul_thrmat(void)
     DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
     DECLARE_ALIGNED(16, int16_t, dst_new)[64];
     const int q = (uint8_t)rnd();
-    declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     ff_fsppdsp_init(&fspp);
 
@@ -136,7 +136,7 @@ static void check_column_fidct(void)
         NB_BLOCKS = 8, ///< arbitrary
     };
     FSPPDSPContext fspp;
-    declare_func(void, int16_t *thr_adr, int16_t *data,
+    declare_func(void, const int16_t *thr_adr, const int16_t *data,
                        int16_t *output, int cnt);
 
     ff_fsppdsp_init(&fspp);

commit cc97f1e276241d93139b3ac46a358b7b9f77066b
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:05:30 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/vf_fspp: Fix effective type violation
    
    Also don't use an unnecessarily large alignment; this avoids having to
    align the stack.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index fa562cbd45..3db7fe114e 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -114,9 +114,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
     const int qpsh = 4 - p->hsub * !is_luma;
     const int qpsv = 4 - p->vsub * !is_luma;
 
-    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * 
BLOCKSZ];
-    int16_t *block  = (int16_t *)block_align;
-    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
+    DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * 
BLOCKSZ];
+    int16_t *block  = block_align;
+    int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
 
     memset(block3, 0, 4 * 8 * BLOCKSZ);
 

commit 3cd452cbf15459334e52c7f2aa92654c822732d5
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 22:44:28 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100

    avfilter/x86/vf_fspp: Avoid stack on x64
    
    Possible due to the larger number of registers available on x64.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 3f37911722..cad44ed0bf 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -210,35 +210,47 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m6, m2
     psubw     m7, m2
     mova      m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq]
+%define THRQ m8
+%else
+%define THRQ [thrq]
+%endif
     paddw     m5, m6
     psubw     m2, m6
     paddw     m7, m1
     mova      m6, [thrq+4*16]
     psllw     m7, 1
-    psubw     m5, [thrq]
+    psubw     m5, THRQ
     psubw     m2, m6
-    paddusw   m5, [thrq]
+    paddusw   m5, THRQ
     paddusw   m2, m6
-    pmulhw    m7, [pw_5A82]
-    paddw     m5, [thrq]
+    pmulhw    m7, SQRT2
+    paddw     m5, THRQ
     paddw     m2, m6
-    psubusw   m5, [thrq]
+    psubusw   m5, THRQ
     psubusw   m2, m6
     paddw     m5, [pw_2]
     mova      m6, m2
     paddw     m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq+2*16]
+%define THRQ m8
+%else
+%define THRQ [thrq+2*16]
+%endif
     psubw     m5, m6
     mova      m6, m1
     paddw     m1, m7
-    psubw     m1, [thrq+2*16]
+    psubw     m1, THRQ
     psubw     m6, m7
     mova      m7, [thrq+6*16]
     psraw     m5, 2
-    paddusw   m1, [thrq+2*16]
+    paddusw   m1, THRQ
     psubw     m6, m7
-    paddw     m1, [thrq+2*16]
+    paddw     m1, THRQ
     paddusw   m6, m7
-    psubusw   m1, [thrq+2*16]
+    psubusw   m1, THRQ
     paddw     m6, m7
     psubw     m3, [srcq+DCTSIZE*4*2]
     psubusw   m6, m7
@@ -250,15 +262,15 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m6, m7
     psraw     m6, 2
     mova      m7, m2
-    pmulhw    m1, [pw_5A82]
+    pmulhw    m1, SQRT2
     paddw     m2, m6
-    mova   [rsp], m2
+    mova    tmp0, m2
     psubw     m7, m6
     mova      m2, [srcq+DCTSIZE*2*2]
     psubw     m1, m6
     psubw     m2, [srcq+DCTSIZE*5*2]
     mova      m6, m5
-    mova      [rsp+16*3], m7
+    mova    tmp3, m7
     paddw     m3, m2
     paddw     m2, m4
     paddw     m4, m0
@@ -272,14 +284,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psllw     m2, 1
     pmulhw    m4, [pw_539F]
     paddw     m5, m1
-    pmulhw    m2, [pw_5A82]
+    pmulhw    m2, SQRT2
     psubw     m6, m1
     paddw     m7, m3
-    mova      [rsp+16], m5
+    mova    tmp1, m5
     paddw     m4, m3
     mova      m3, [thrq+3*16]
     mova      m1, m0
-    mova      [rsp+16*2], m6
+    mova    tmp2, m6
     psubw     m1, m2
     paddw     m0, m2
     mova      m5, m1
@@ -319,14 +331,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %endif
     or      tmpq, tmpq
     jnz %1
-    mova      m4, [rsp]
+    mova      m4, tmp0
     psraw     m3, m0, 2
     mova      m5, [outq+DCTSIZE*0*2]
     pmulhw    m1, m0, [pw_7642]
     pmulhw    m2, m0, [pw_4546]
-    pmulhw    m0, [pw_5A82]
+    pmulhw    m0, SQRT2
     paddw     m5, m4
-    mova      m6, [rsp+16]
+    mova      m6, tmp1
     psubw     m2, m1
     psubw     m4, m3
     mova      m7, [outq+DCTSIZE*1*2]
@@ -337,7 +349,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m0
     mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
-    mova      m3, [rsp+16*2]
+    mova      m3, tmp2
     psubw     m6, m1
     mova      m4, [outq+DCTSIZE*2*2]
     paddw     m7, m1
@@ -349,7 +361,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m4, m0
     mova      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
-    mova      m0, [rsp+16*3]
+    mova      m0, tmp3
     mova      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
     mova      [outq+DCTSIZE*2*2], m4
@@ -376,23 +388,23 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m6
     pmulhw    m0, [pw_4546]
     mova      m7, m2
-    mova      m4, [rsp]
+    mova      m4, tmp0
     psubw     m2, m3
     paddw     m7, m3
-    pmulhw    m2, [pw_5A82]
+    pmulhw    m2, SQRT2
     mova      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]
     psubw     m6, m7
-    mova      m3, [rsp+16]
+    mova      m3, tmp1
     paddw     m4, m7
     mova      [outq+DCTSIZE*7*2], m6
     paddw     m1, m5
     mova  [outq], m4
     psubw     m1, m7
-    mova      m7, [rsp+16*2]
+    mova      m7, tmp2
     psubw     m0, m5
-    mova      m6, [rsp+16*3]
+    mova      m6, tmp3
     mova      m5, m3
     paddw     m3, [outq+DCTSIZE*1*2]
     psubw     m5, m1
@@ -419,7 +431,21 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %endmacro
 
 ;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp
+%if ARCH_X86_64
+    %define tmp0 m8
+    %define tmp1 m9
+    %define tmp2 m10
+    %define tmp3 m11
+    %define SQRT2 m12
+    mova     m12, [pw_5A82]
+%else
+    %define tmp0 [rsp]
+    %define tmp1 [rsp+16]
+    %define tmp2 [rsp+2*16]
+    %define tmp3 [rsp+3*16]
+    %define SQRT2 [pw_5A82]
+%endif
 .fdct:
     COLUMN_FDCT .idct
     sub    cntd, 2

commit ddd74276f85b3c53809d59ffc640b7b45f5a125f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 21:42:32 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100

    avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2
    
    It gains a lot because it has to operate on eight words;
    it also saves 608B of .text here.
    
    Old benchmarks:
    column_fidct_c:                                       3365.7 ( 1.00x)
    column_fidct_mmx:                                     1784.6 ( 1.89x)
    
    New benchmarks:
    column_fidct_c:                                       3361.5 ( 1.00x)
    column_fidct_sse2:                                     801.1 ( 4.20x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index f61efc99f8..3f37911722 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -26,18 +26,18 @@
 SECTION_RODATA
 
 cextern fspp_dither
+pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
+pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
+pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
+pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
+pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
+pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
+pw_2:    times 8 dw 2
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
-pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
-pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
-pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
-pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
-pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
-pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
 pw_4:    times 4 dw 4
-pw_2:    times 4 dw 2
 
 SECTION .text
 
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     mova      [thrq+16*7], m3
     RET
 
-%macro COLUMN_FDCT 1-3 0, 0
-    movq      m1, [srcq+DCTSIZE*0*2]
-    movq      m7, [srcq+DCTSIZE*3*2]
-    movq      m0, m1
+%macro COLUMN_FDCT 1
+    mova      m1, [srcq+DCTSIZE*0*2]
+    mova      m7, [srcq+DCTSIZE*3*2]
+    mova      m0, m1
     paddw     m1, [srcq+DCTSIZE*7*2]
-    movq      m3, m7
+    mova      m3, m7
     paddw     m7, [srcq+DCTSIZE*4*2]
-    movq      m5, m1
-    movq      m6, [srcq+DCTSIZE*1*2]
+    mova      m5, m1
+    mova      m6, [srcq+DCTSIZE*1*2]
     psubw     m1, m7
-    movq      m2, [srcq+DCTSIZE*2*2]
-    movq      m4, m6
+    mova      m2, [srcq+DCTSIZE*2*2]
+    mova      m4, m6
     paddw     m6, [srcq+DCTSIZE*6*2]
     paddw     m5, m7
     paddw     m2, [srcq+DCTSIZE*5*2]
-    movq      m7, m6
+    mova      m7, m6
     paddw     m6, m2
     psubw     m7, m2
-    movq      m2, m5
+    mova      m2, m5
     paddw     m5, m6
     psubw     m2, m6
     paddw     m7, m1
-    movq      m6, [thrq+4*16+%2]
+    mova      m6, [thrq+4*16]
     psllw     m7, 1
-    psubw     m5, [thrq+%2]
+    psubw     m5, [thrq]
     psubw     m2, m6
-    paddusw   m5, [thrq+%2]
+    paddusw   m5, [thrq]
     paddusw   m2, m6
     pmulhw    m7, [pw_5A82]
-    paddw     m5, [thrq+%2]
+    paddw     m5, [thrq]
     paddw     m2, m6
-    psubusw   m5, [thrq+%2]
+    psubusw   m5, [thrq]
     psubusw   m2, m6
     paddw     m5, [pw_2]
-    movq      m6, m2
+    mova      m6, m2
     paddw     m2, m5
     psubw     m5, m6
-    movq      m6, m1
+    mova      m6, m1
     paddw     m1, m7
-    psubw     m1, [thrq+2*16+%2]
+    psubw     m1, [thrq+2*16]
     psubw     m6, m7
-    movq      m7, [thrq+6*16+%2]
+    mova      m7, [thrq+6*16]
     psraw     m5, 2
-    paddusw   m1, [thrq+2*16+%2]
+    paddusw   m1, [thrq+2*16]
     psubw     m6, m7
-    paddw     m1, [thrq+2*16+%2]
+    paddw     m1, [thrq+2*16]
     paddusw   m6, m7
-    psubusw   m1, [thrq+2*16+%2]
+    psubusw   m1, [thrq+2*16]
     paddw     m6, m7
     psubw     m3, [srcq+DCTSIZE*4*2]
     psubusw   m6, m7
-    movq      m7, m1
+    mova      m7, m1
     psraw     m2, 2
     psubw     m4, [srcq+DCTSIZE*6*2]
     psubw     m1, m6
     psubw     m0, [srcq+DCTSIZE*7*2]
     paddw     m6, m7
     psraw     m6, 2
-    movq      m7, m2
+    mova      m7, m2
     pmulhw    m1, [pw_5A82]
     paddw     m2, m6
-    movq      [rsp], m2
+    mova   [rsp], m2
     psubw     m7, m6
-    movq      m2, [srcq+DCTSIZE*2*2]
+    mova      m2, [srcq+DCTSIZE*2*2]
     psubw     m1, m6
     psubw     m2, [srcq+DCTSIZE*5*2]
-    movq      m6, m5
-    movq      [rsp+8*3], m7
+    mova      m6, m5
+    mova      [rsp+16*3], m7
     paddw     m3, m2
     paddw     m2, m4
     paddw     m4, m0
-    movq      m7, m3
+    mova      m7, m3
     psubw     m3, m4
     psllw     m7, 1
     pmulhw    m3, [pw_61F8]
     psllw     m4, 2
+    add     srcq, 32
     pmulhw    m7, [pw_4546]
     psllw     m2, 1
     pmulhw    m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     pmulhw    m2, [pw_5A82]
     psubw     m6, m1
     paddw     m7, m3
-    movq      [rsp+8], m5
+    mova      [rsp+16], m5
     paddw     m4, m3
-    movq      m3, [thrq+3*16+%2]
-    movq      m1, m0
-    movq      [rsp+8*2], m6
+    mova      m3, [thrq+3*16]
+    mova      m1, m0
+    mova      [rsp+16*2], m6
     psubw     m1, m2
     paddw     m0, m2
-    movq      m5, m1
-    movq      m2, [thrq+5*16+%2]
+    mova      m5, m1
+    mova      m2, [thrq+5*16]
     psubw     m1, m7
     paddw     m5, m7
     psubw     m1, m3
-    movq      m7, [thrq+16+%2]
+    mova      m7, [thrq+16]
     psubw     m5, m2
-    movq      m6, m0
+    mova      m6, m0
     paddw     m0, m4
     paddusw   m1, m3
     psubw     m6, m4
-    movq      m4, [thrq+7*16+%2]
+    mova      m4, [thrq+7*16]
     psubw     m0, m7
     psubw     m6, m4
     paddusw   m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psubusw   m1, m3
     psubusw   m5, m2
     psubusw   m6, m4
-    movq      m4, m1
+    mova      m4, m1
     por       m4, m5
     paddusw   m0, m7
     por       m4, m6
     paddw     m0, m7
     packssdw  m4, m4
     psubusw   m0, m7
-    movd      tmpd, m4
-    or        tmpd, tmpd
+%if ARCH_X86_64
+    movq    tmpq, m4
+%else
+    packssdw  m4, m4
+    movd    tmpd, m4
+%endif
+    or      tmpq, tmpq
     jnz %1
-    movq      m4, [rsp]
+    mova      m4, [rsp]
     psraw     m3, m0, 2
     mova      m5, [outq+DCTSIZE*0*2]
     pmulhw    m1, m0, [pw_7642]
     pmulhw    m2, m0, [pw_4546]
     pmulhw    m0, [pw_5A82]
     paddw     m5, m4
-    movq      m6, [rsp+8]
+    mova      m6, [rsp+16]
     psubw     m2, m1
     psubw     m4, m3
-    movq      m7, [outq+DCTSIZE*1*2]
+    mova      m7, [outq+DCTSIZE*1*2]
     paddw     m5, m3
     psubw     m1, m3
     mova      [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m0
     mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
-    movq      m3, [rsp+8*2]
+    mova      m3, [rsp+16*2]
     psubw     m6, m1
-    movq      m4, [outq+DCTSIZE*2*2]
+    mova      m4, [outq+DCTSIZE*2*2]
     paddw     m7, m1
-    movq      [outq], m5
+    mova  [outq], m5
     paddw     m4, m3
-    movq      [outq+DCTSIZE*6*2], m6
+    mova      [outq+DCTSIZE*6*2], m6
     psubw     m3, m0
-    movq      m5, [outq+DCTSIZE*5*2]
+    mova      m5, [outq+DCTSIZE*5*2]
     paddw     m4, m0
-    movq      m6, [outq+DCTSIZE*3*2]
+    mova      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
-    movq      m0, [rsp+8*3]
-    add       srcq, 8+%3
-    movq      [outq+DCTSIZE*1*2], m7
+    mova      m0, [rsp+16*3]
+    mova      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
-    movq      [outq+DCTSIZE*2*2], m4
+    mova      [outq+DCTSIZE*2*2], m4
     paddw     m0, m2
-    movq      m7, [outq+DCTSIZE*4*2]
+    mova      m7, [outq+DCTSIZE*4*2]
     psubw     m6, m2
-    movq      [outq+DCTSIZE*5*2], m5
+    mova      [outq+DCTSIZE*5*2], m5
     paddw     m7, m0
-    movq      [outq+DCTSIZE*3*2], m6
-    movq      [outq+DCTSIZE*4*2], m7
-    add       outq, 8+%3
+    mova      [outq+DCTSIZE*3*2], m6
+    mova      [outq+DCTSIZE*4*2], m7
+    add     outq, 32
 %endmacro
 
-%macro COLUMN_IDCT 0-1 0
-    movq      m3, m5
+%macro COLUMN_IDCT 0
+    mova      m3, m5
     psubw     m5, m1
     paddw     m3, m1
-    movq      m2, m0
+    mova      m2, m0
     psubw     m0, m6
     psllw     m1, m5, 1
     pmulhw    m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     pmulhw    m5, [pw_7642]
     paddw     m2, m6
     pmulhw    m0, [pw_4546]
-    movq      m7, m2
-    movq      m4, [rsp]
+    mova      m7, m2
+    mova      m4, [rsp]
     psubw     m2, m3
     paddw     m7, m3
     pmulhw    m2, [pw_5A82]
-    movq      m6, m4
+    mova      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]
     psubw     m6, m7
-    movq      m3, [rsp+8]
+    mova      m3, [rsp+16]
     paddw     m4, m7
-    movq      [outq+DCTSIZE*7*2], m6
+    mova      [outq+DCTSIZE*7*2], m6
     paddw     m1, m5
-    movq      [outq], m4
+    mova  [outq], m4
     psubw     m1, m7
-    movq      m7, [rsp+8*2]
+    mova      m7, [rsp+16*2]
     psubw     m0, m5
-    movq      m6, [rsp+8*3]
-    movq      m5, m3
+    mova      m6, [rsp+16*3]
+    mova      m5, m3
     paddw     m3, [outq+DCTSIZE*1*2]
     psubw     m5, m1
     psubw     m2, m1
     paddw     m3, m1
-    movq      [outq+DCTSIZE*6*2], m5
-    movq      m4, m7
+    mova      [outq+DCTSIZE*6*2], m5
+    mova      m4, m7
     paddw     m7, [outq+DCTSIZE*2*2]
     psubw     m4, m2
     paddw     m4, [outq+DCTSIZE*5*2]
     paddw     m7, m2
-    movq      [outq+DCTSIZE*1*2], m3
+    mova      [outq+DCTSIZE*1*2], m3
     paddw     m0, m2
-    movq      [outq+DCTSIZE*2*2], m7
-    movq      m1, m6
+    mova      [outq+DCTSIZE*2*2], m7
+    mova      m1, m6
     paddw     m6, [outq+DCTSIZE*4*2]
     psubw     m1, m0
     paddw     m1, [outq+DCTSIZE*3*2]
     paddw     m6, m0
-    movq      [outq+DCTSIZE*5*2], m4
-    add       srcq, 8+%1
-    movq      [outq+DCTSIZE*4*2], m6
-    movq      [outq+DCTSIZE*3*2], m1
-    add       outq, 8+%1
+    mova      [outq+DCTSIZE*5*2], m4
+    mova      [outq+DCTSIZE*4*2], m6
+    mova      [outq+DCTSIZE*3*2], m1
+    add     outq, 32
 %endmacro
 
-INIT_MMX mmx
-;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
-.fdct1:
-    COLUMN_FDCT .idct1
-    jmp .fdct2
-
-.idct1:
-    COLUMN_IDCT
-
-.fdct2:
-    COLUMN_FDCT .idct2, 8, 16
+;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+.fdct:
+    COLUMN_FDCT .idct
     sub    cntd, 2
-    jg .fdct1
+    jg .fdct
     RET
 
-.idct2:
-    COLUMN_IDCT 16
+.idct:
+    COLUMN_IDCT
     sub    cntd, 2
-    jg .fdct1
+    jg .fdct
     RET
 
+INIT_MMX mmx
 ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
     add       strideq, strideq
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index ee875547d2..c7a9b1799e 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
 
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
         s->store_slice  = ff_store_slice_sse2;
         s->store_slice2 = ff_store_slice2_sse2;
         s->mul_thrmat   = ff_mul_thrmat_sse2;
+        s->column_fidct = ff_column_fidct_sse2;
     }
 }
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index de407739d9..ccb9cd1e7d 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -136,8 +136,8 @@ static void check_column_fidct(void)
         NB_BLOCKS = 8, ///< arbitrary
     };
     FSPPDSPContext fspp;
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
-                      int16_t *output, int cnt);
+    declare_func(void, int16_t *thr_adr, int16_t *data,
+                       int16_t *output, int cnt);
 
     ff_fsppdsp_init(&fspp);
 

commit 68b11cde8212b8ea0309ef6d11b01c782fa0b943
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 21:03:06 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100

    tests/checkasm/vf_fspp: Add test for column_fidct
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 29b91f98d7..de407739d9 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -36,6 +36,12 @@
             buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
     } while (0)
 
+#define randomize_buffer_range(buf, min, max)               \
+    do {                                                    \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j)    \
+            buf[j] = min + rnd() % (max - min + 1);         \
+    } while (0)
+
 static void check_store_slice(void)
 {
     enum {
@@ -124,8 +130,41 @@ static void check_mul_thrmat(void)
     }
 }
 
+static void check_column_fidct(void)
+{
+    enum {
+        NB_BLOCKS = 8, ///< arbitrary
+    };
+    FSPPDSPContext fspp;
+    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
+                      int16_t *output, int cnt);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.column_fidct, "column_fidct")) {
+        DECLARE_ALIGNED(16, int16_t, threshold)[64];
+        DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
+        DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
+        DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
+
+        randomize_buffer_range(threshold, 0, INT16_MAX);
+        randomize_buffer_range(src, -1284, 1284);
+        randomize_buffers(dst_new);
+        memcpy(dst_ref, dst_new, sizeof(dst_ref));
+
+        call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
+        call_new(threshold, src, dst_new, NB_BLOCKS * 8);
+
+        if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
+            fail();
+
+        bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
+    }
+}
+
 void checkasm_check_vf_fspp(void)
 {
     check_store_slice();
     check_mul_thrmat();
+    check_column_fidct();
 }

commit 63493bf0e0909e701b64392be419f69491b8cbf1
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 19:39:35 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100

    avfilter/x86/vf_fspp: Put shifts into constants
    
    This avoids some shift instructions and also gives us more headroom
    in the registers. In fact, I have proven to myself that everything
    that is supposed to fit into 16 bits now actually does so.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 2f49945c13..f61efc99f8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -27,10 +27,13 @@ SECTION_RODATA
 
 cextern fspp_dither
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
 pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
 pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
 pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
 pw_4:    times 4 dw 4
@@ -211,12 +214,12 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psubw     m2, m6
     paddw     m7, m1
     movq      m6, [thrq+4*16+%2]
-    psllw     m7, 2
+    psllw     m7, 1
     psubw     m5, [thrq+%2]
     psubw     m2, m6
     paddusw   m5, [thrq+%2]
     paddusw   m2, m6
-    pmulhw    m7, [pw_2D41]
+    pmulhw    m7, [pw_5A82]
     paddw     m5, [thrq+%2]
     paddw     m2, m6
     psubusw   m5, [thrq+%2]
@@ -261,15 +264,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m4, m0
     movq      m7, m3
     psubw     m3, m4
-    psllw     m3, 2
-    psllw     m7, 2
-    pmulhw    m3, [pw_187E]
+    psllw     m7, 1
+    pmulhw    m3, [pw_61F8]
     psllw     m4, 2
-    pmulhw    m7, [pw_22A3]
-    psllw     m2, 2
+    pmulhw    m7, [pw_4546]
+    psllw     m2, 1
     pmulhw    m4, [pw_539F]
     paddw     m5, m1
-    pmulhw    m2, [pw_2D41]
+    pmulhw    m2, [pw_5A82]
     psubw     m6, m1
     paddw     m7, m3
     movq      [rsp+8], m5
@@ -313,11 +315,10 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     jnz %1
     movq      m4, [rsp]
     psraw     m3, m0, 2
-    psllw     m0, 1
     mova      m5, [outq+DCTSIZE*0*2]
-    pmulhw    m1, m0, [pw_3B21]
-    pmulhw    m2, m0, [pw_22A3]
-    pmulhw    m0, [pw_2D41]
+    pmulhw    m1, m0, [pw_7642]
+    pmulhw    m2, m0, [pw_4546]
+    pmulhw    m0, [pw_5A82]
     paddw     m5, m4
     movq      m6, [rsp+8]
     psubw     m2, m1
@@ -360,23 +361,20 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %macro COLUMN_IDCT 0-1 0
     movq      m3, m5
     psubw     m5, m1
-    psllw     m5, 1
     paddw     m3, m1
     movq      m2, m0
     psubw     m0, m6
-    movq      m1, m5
-    psllw     m0, 1
+    psllw     m1, m5, 1
     pmulhw    m1, [pw_AC62]
     paddw     m5, m0
-    pmulhw    m5, [pw_3B21]
+    pmulhw    m5, [pw_7642]
     paddw     m2, m6
-    pmulhw    m0, [pw_22A3]
+    pmulhw    m0, [pw_4546]
     movq      m7, m2
     movq      m4, [rsp]
     psubw     m2, m3
-    psllw     m2, 1
     paddw     m7, m3
-    pmulhw    m2, [pw_2D41]
+    pmulhw    m2, [pw_5A82]
     movq      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]

commit 66af18d06a3faf9f8960ad6bd5a400701a0cdaab
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 18:44:49 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100

    avfilter/x86/vf_fspp: Make ff_column_fidct_mmx() bitexact
    
    It currently is not, because the shortcut mode uses different rounding
    than the C code (as well as the non-shortcut code).
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 489e69f8ce..2f49945c13 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
 pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
 pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
-pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
-pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
-pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
 pw_4:    times 4 dw 4
 pw_2:    times 4 dw 2
 
@@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     or        tmpd, tmpd
     jnz %1
     movq      m4, [rsp]
-    movq      m1, m0
-    pmulhw    m0, [pw_3642]
-    movq      m2, m1
-    movq      m5, [outq+DCTSIZE*0*2]
-    movq      m3, m2
-    pmulhw    m1, [pw_2441]
+    psraw     m3, m0, 2
+    psllw     m0, 1
+    mova      m5, [outq+DCTSIZE*0*2]
+    pmulhw    m1, m0, [pw_3B21]
+    pmulhw    m2, m0, [pw_22A3]
+    pmulhw    m0, [pw_2D41]
     paddw     m5, m4
     movq      m6, [rsp+8]
-    psraw     m3, 2
-    pmulhw    m2, [pw_0CBB]
+    psubw     m2, m1
     psubw     m4, m3
     movq      m7, [outq+DCTSIZE*1*2]
     paddw     m5, m3
-    movq      [outq+DCTSIZE*7*2], m4
+    psubw     m1, m3
+    mova      [outq+DCTSIZE*7*2], m4
+    psubw     m0, m1
+    paddw     m2, m0
+    mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
     movq      m3, [rsp+8*2]
-    psubw     m6, m0
+    psubw     m6, m1
     movq      m4, [outq+DCTSIZE*2*2]
-    paddw     m7, m0
+    paddw     m7, m1
     movq      [outq], m5
     paddw     m4, m3
     movq      [outq+DCTSIZE*6*2], m6
-    psubw     m3, m1
+    psubw     m3, m0
     movq      m5, [outq+DCTSIZE*5*2]
-    paddw     m4, m1
+    paddw     m4, m0
     movq      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
     movq      m0, [rsp+8*3]
@@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     movq      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
     movq      [outq+DCTSIZE*2*2], m4
-    psubw     m0, m2
+    paddw     m0, m2
     movq      m7, [outq+DCTSIZE*4*2]
-    paddw     m6, m2
+    psubw     m6, m2
     movq      [outq+DCTSIZE*5*2], m5
     paddw     m7, m0
     movq      [outq+DCTSIZE*3*2], m6

commit 1049a5fba8f9437b94050105be8d32545675315e
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 14:21:09 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/vf_fsppdsp: Reduce discrepancies between C code and x86 asm
    
    The x86 assembly uses the following pattern to zero all
    the values with abs<threshold:
        x -= threshold;
        x satu+= threshold (unsigned saturated addition)
        x += threshold
        x satu-= threshold (unsigned saturated subtraction)
    The reference C code meanwhile zeroed everything
    with abs <= threshold. This commit makes the C code behave
    like the x86 assembly to reduce discrepancies between the two.
    
    An alternative would be to require SSSE3, so that
    one can use pabsw, pcmpgtw for abs>threshold, followed by
    a pand with the original data. Or one could modify the thresholds
    to make both equal.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 583571bf94..e530bcd06b 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -34,7 +34,7 @@
 
 #define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
 #define THRESHOLD(r,x,t)                         \
-    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
+    if (((unsigned)((x) + t)) >= t * 2) r = (x); \
     else r = 0;
 #define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
 

commit d19050a1ae90b4ad8e9e2dadc5c8ca0c39301d69
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 23:03:23 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/vf_fsppdsp: Use restrict
    
    It is possible because the requirements are fulfilled;
    it is also beneficial both performance-wise and code-size-wise.
    For GCC 14 (with -O3), this reduced code size by 26750B
    here; for Clang 20, it was 432B.
    
    Old benchmarks:
    mul_thrmat_c:                                            4.3 ( 1.00x)
    mul_thrmat_sse2:                                         4.3 ( 1.00x)
    store_slice_c:                                        2810.8 ( 1.00x)
    store_slice_sse2:                                      542.5 ( 5.18x)
    store_slice2_c:                                       3817.0 ( 1.00x)
    store_slice2_sse2:                                     410.4 ( 9.30x)
    
    New benchmarks:
    mul_thrmat_c:                                            4.3 ( 1.00x)
    mul_thrmat_sse2:                                         4.3 ( 1.00x)
    store_slice_c:                                        1510.1 ( 1.00x)
    store_slice_sse2:                                      545.2 ( 2.77x)
    store_slice2_c:                                       1763.5 ( 1.00x)
    store_slice2_sse2:                                     408.3 ( 4.32x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index f3f7c87174..583571bf94 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
 };
 
 //This func reads from 1 slice, 1 and clears 0 & 1
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 {
@@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
 }
 
 //This func reads from 2 slices, 0 & 2  and clears 2-nd
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 {
@@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
     }
 }
 
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
 {
     for (int a = 0; a < 64; a++)
         thr_adr[a] = q * thr_adr_noq[a];
 }
 
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+                       int16_t *restrict output, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt
     }
 }
 
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_str
     }
 }
 
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index b440809f02..66030da4b1 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,40 +31,43 @@
 #include "libavutil/attributes_internal.h"
 
 typedef struct FSPPDSPContext {
-    void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
+    void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
+    void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
-                       int16_t *thr_adr /* align 16 */, int q);
+    void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+                       int16_t *restrict thr_adr /* align 16 */, int q);
 
-    void (*column_fidct)(int16_t *thr_adr, int16_t *data,
-                         int16_t *output, int cnt);
+    void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+                         int16_t *restrict output, int cnt);
 
-    void (*row_idct)(int16_t *workspace, int16_t *output_adr,
+    void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
                      ptrdiff_t output_stride, int cnt);
 
-    void (*row_fdct)(int16_t *data, const uint8_t *pixels,
+    void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
                      ptrdiff_t line_size, int cnt);
 } FSPPDSPContext;
 
 FF_VISIBILITY_PUSH_HIDDEN
 extern const uint8_t ff_fspp_dither[8][8];
 
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+                       int16_t *restrict output, int cnt);
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt);
 
 void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
 FF_VISIBILITY_POP_HIDDEN

commit ff85a20b7db4d3226ada8533b181989944f30e75
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 22:06:34 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/x86/vf_fspp: Port store_slice to SSE2
    
    Old benchmarks:
    store_slice_c:                                        2798.3 ( 1.00x)
    store_slice_mmx:                                       950.2 ( 2.94x)
    store_slice2_c:                                       3811.7 ( 1.00x)
    store_slice2_mmx:                                      682.3 ( 5.59x)
    
    New benchmarks:
    store_slice_c:                                        2797.2 ( 1.00x)
    store_slice_sse2:                                      543.5 ( 5.15x)
    store_slice2_c:                                       3817.0 ( 1.00x)
    store_slice2_sse2:                                     408.2 ( 9.35x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index e87fa6861c..b440809f02 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,11 +31,11 @@
 #include "libavutil/attributes_internal.h"
 
 typedef struct FSPPDSPContext {
-    void (*store_slice)(uint8_t *dst, int16_t *src,
+    void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*store_slice2)(uint8_t *dst, int16_t *src,
+    void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c9408978d8..489e69f8ce 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -43,15 +43,15 @@ SECTION .text
 
 %define DCTSIZE 8
 
-INIT_MMX mmx
+INIT_XMM sse2
 
-;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
+;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 %if ARCH_X86_64
-cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
     mov       widthq, r4m
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     mov       tmpq, src_strideq
     and       widthq, ~7
     sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
     xor       ditherq, -1 ; log2_scale
     mov       tmp2q, tmpq
     add       ditherq, 7 ; log2_scale
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     mov       src_strideq, tmp2q
     shl       tmpq, 4
     lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1
 
 .loop_height:
     movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
     mov       tmp2q, widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4
 
 .loop_width:
-    movq      [srcq+tmpq], m7
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
-    movq      [srcq+tmpq+8], m7
+    mova      m0, [srcq]
+    mova      [srcq+tmpq], m1
     paddw     m0, m3
-    paddw     m1, m4
-    movq      [srcq], m7
+    mova      [srcq], m1
     psraw     m0, m2
-    psraw     m1, m2
-    movq      [srcq+8], m7
-    packuswb  m0, m1
+    packuswb  m0, m0
     add       srcq, 16
     movq      [dstq], m0
     add       dstq, 8
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     jl .loop_height
     RET
 
-;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
-;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+;                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 %if ARCH_X86_64
-cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
     mov       dstq, dstm
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     mov       tmpq, src_strideq
     and       widthq, ~7
     sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
     xor       ditherq, -1 ; log2_scale
     mov       tmp2q, tmpq
     add       ditherq, 7 ; log2_scale
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     mov       src_strideq, tmp2q
     shl       tmpq, 5
     lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1
 
 .loop_height:
     movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
     mov       tmp2q,widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4
 
 .loop_width:
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
+    mova      m0, [srcq]
     paddw     m0, m3
     paddw     m0, [srcq+tmpq]
-    paddw     m1, m4
-    movq      m6, [srcq+tmpq+8]
-    movq      [srcq+tmpq], m7
+    mova      [srcq+tmpq], m1
     psraw     m0, m2
-    paddw     m1, m6
-    movq      [srcq+tmpq+8], m7
-    psraw     m1, m2
-    packuswb  m0, m1
+    packuswb  m0, m0
     movq      [dstq], m0
     add       srcq, 16
     add       dstq, 8
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     RET
 
 ;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-INIT_XMM sse2
 cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     movd      m4, qd
     mova      m0, [thrnq]
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 9f6095ce24..ee875547d2 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -23,12 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_fsppdsp.h"
 
-void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        s->store_slice  = ff_store_slice_mmx;
-        s->store_slice2 = ff_store_slice2_mmx;
         s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        s->store_slice  = ff_store_slice_sse2;
+        s->store_slice2 = ff_store_slice2_sse2;
         s->mul_thrmat   = ff_mul_thrmat_sse2;
     }
 }

commit 570f8fc6c9850edf6c05d58dea0629f162199f20
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 21:57:45 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    tests/checkasm/vf_fspp: Test store_slice
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 117e1c670e..29b91f98d7 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -16,8 +16,12 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stddef.h>
+#include <stdint.h>
+
 #include "checkasm.h"
 #include "libavfilter/vf_fsppdsp.h"
+#include "libavcodec/mathops.h"
 #include "libavutil/mem_internal.h"
 
 #define randomize_buffers(buf)                           \
@@ -26,6 +30,78 @@
             buf[j] = rnd();                              \
     } while (0)
 
+#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\
+    do {                                                    \
+        for (size_t j = 0; j < nb_elems; ++j)               \
+            buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
+    } while (0)
+
+static void check_store_slice(void)
+{
+    enum {
+        MAX_WIDTH  = 256,
+        /// in elements, not in bytes; 32 is arbitrary
+        MAX_STRIDE = MAX_WIDTH + 32,
+        MAX_HEIGHT = 8,
+    };
+    FSPPDSPContext fspp;
+    ff_fsppdsp_init(&fspp);
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+
+    for (int i = 0; i < 2; ++i) {
+        if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
+            // store slice resets the row eight lines above the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            // store_slice2 resets the row 16 lines below the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+            int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
+            ptrdiff_t      width = 1 + rnd() % MAX_WIDTH;
+            ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+            ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+            ptrdiff_t height = 1 + rnd() % 8;
+            size_t nb_elems;
+
+            if (i) {
+                src_ref      = src_ref2;
+                src_new      = src_new2;
+                or_src_ref   = src_ref2;
+                or_src_new   = src_new2;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref2);
+            } else {
+                src_ref      = src_ref1 + 8 * src_stride;
+                src_new      = src_new1 + 8 * src_stride;
+                or_src_ref   = src_ref1;
+                or_src_new   = src_new1;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref1);
+            }
+            if (rnd() & 1) {
+                dst_ref    += dst_stride * (height - 1);
+                dst_new    += dst_stride * (height - 1);
+                dst_stride *= -1;
+            }
+            randomize_buffers(dstbuf_new);
+            memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
+            randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
+
+            ptrdiff_t log2_scale = rnd() & 1;
+            call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
+            call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
+            if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
+                memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
+                fail();
+            // don't use random parameters for benchmarks
+            src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
+            bench_new(dstbuf_new, src_ref,
+                      MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
+        }
+    }
+}
 
 static void check_mul_thrmat(void)
 {
@@ -50,5 +126,6 @@ static void check_mul_thrmat(void)
 
 void checkasm_check_vf_fspp(void)
 {
+    check_store_slice();
     check_mul_thrmat();
 }

commit e042f17e9947779e3b1b981218370472940ca3c6
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 12:54:31 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/vf_fsppdsp: Use standard clamping
    
    This is obviously what is intended and what the MMX code does;
    yet I cannot rule out that it changes the output for some inputs:
    I have observed individual src values which would lead to temp
    values just above 512 if they came in pairs (i.e. if both inputs
    were simultaneously huge).
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index b84d7b57bb..f3f7c87174 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -24,6 +24,7 @@
 
 #include "vf_fsppdsp.h"
 
+#include "libavutil/common.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/mem_internal.h"
 
@@ -70,7 +71,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
 #define STORE(pos)                                                             \
     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
-    if (temp & 0x100) temp = ~(temp >> 31);                                    \
+    temp = av_clip_uint8(temp);                                                \
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
@@ -99,7 +100,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
 #define STORE2(pos)                                                                                       \
     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
     src[x + pos + 16 * src_stride] = 0;                                                                   \
-    if (temp & 0x100) temp = ~(temp >> 31);                                                               \
+    temp = av_clip_uint8(temp);                                                                           \
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {

commit 52ba2ac7bd48d09d1f8527376970e2b0e8ee5068
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 19:10:30 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/x86/vf_fspp: Port mul_thrmat to SSE2
    
    This fixes an ABI violation, as mul_thrmat did not issue emms.
    It seems that this ABI violation could reach the user, namely
    if ff_get_video_buffer() fails. Notice that ff_get_video_buffer()
    itself could fail because of this, namely if the allocator uses
    floating point registers.
    
    On x64 (where GCC already used SSE2 in the C version)
    mul_thrmat_c:                                            4.4 ( 1.00x)
    mul_thrmat_mmx:                                          8.6 ( 0.52x)
    mul_thrmat_sse2:                                         4.4 ( 1.00x)
    
    On 32bit (where SSE2 is not known to be available):
    mul_thrmat_c:                                           56.0 ( 1.00x)
    mul_thrmat_sse2:                                         6.0 ( 9.40x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9371c63e77..fa562cbd45 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -54,8 +54,6 @@
 
 typedef struct FSPPContext {
     const struct AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
 
     int log2_count;
     int strength;
@@ -72,6 +70,9 @@ typedef struct FSPPContext {
     int use_bframe_qp;
 
     FSPPDSPContext dsp;
+
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
 } FSPPContext;
 
 
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 0dbd628abf..e87fa6861c 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,7 +39,8 @@ typedef struct FSPPDSPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
+                       int16_t *thr_adr /* align 16 */, int q);
 
     void (*column_fidct)(int16_t *thr_adr, int16_t *data,
                          int16_t *output, int cnt);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 0ea6216193..c9408978d8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     jl .loop_height
     RET
 
-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
-    movd      m7, qd
-    movq      m0, [thrnq]
-    punpcklwd m7, m7
-    movq      m1, [thrnq+8]
-    punpckldq m7, m7
-    pmullw    m0, m7
-    movq      m2, [thrnq+8*2]
-    pmullw    m1, m7
-    movq      m3, [thrnq+8*3]
-    pmullw    m2, m7
-    movq      [thrq], m0
-    movq      m4, [thrnq+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8], m1
-    movq      m5, [thrnq+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*2], m2
-    movq      m6, [thrnq+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*3], m3
-    movq      m0, [thrnq+8*7]
-    pmullw    m6, m7
-    movq      [thrq+8*4], m4
-    movq      m1, [thrnq+8*7+8]
-    pmullw    m0, m7
-    movq      [thrq+8*5], m5
-    movq      m2, [thrnq+8*7+8*2]
-    pmullw    m1, m7
-    movq      [thrq+8*6], m6
-    movq      m3, [thrnq+8*7+8*3]
-    pmullw    m2, m7
-    movq      [thrq+8*7], m0
-    movq      m4, [thrnq+8*7+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8*7+8], m1
-    movq      m5, [thrnq+8*7+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*7+8*2], m2
-    movq      m6, [thrnq+8*7+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*7+8*3], m3
-    movq      m0, [thrnq+14*8]
-    pmullw    m6, m7
-    movq      [thrq+8*7+8*4], m4
-    movq      m1, [thrnq+14*8+8]
-    pmullw    m0, m7
-    movq      [thrq+8*7+8*5], m5
-    pmullw    m1, m7
-    movq      [thrq+8*7+8*6], m6
-    movq      [thrq+14*8], m0
-    movq      [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+INIT_XMM sse2
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+    movd      m4, qd
+    mova      m0, [thrnq]
+    punpcklwd m4, m4
+    mova      m1, [thrnq+16]
+    pshufd    m4, m4, 0
+    pmullw    m0, m4
+    mova      m2, [thrnq+16*2]
+    pmullw    m1, m4
+    mova      m3, [thrnq+16*3]
+    pmullw    m2, m4
+    mova      [thrq], m0
+    mova      m0, [thrnq+16*4]
+    pmullw    m3, m4
+    mova      [thrq+16], m1
+    mova      m1, [thrnq+16*5]
+    pmullw    m0, m4
+    mova      [thrq+16*2], m2
+    mova      m2, [thrnq+16*6]
+    pmullw    m1, m4
+    mova      [thrq+16*3], m3
+    mova      m3, [thrnq+16*7]
+    pmullw    m2, m4
+    mova      [thrq+16*4], m0
+    pmullw    m3, m4
+    mova      [thrq+16*5], m1
+    mova      [thrq+16*6], m2
+    mova      [thrq+16*7], m3
     RET
 
 %macro COLUMN_FDCT 1-3 0, 0
@@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
     add       outq, 8+%1
 %endmacro
 
+INIT_MMX mmx
 ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
 .fdct1:
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 2aadb50967..9f6095ce24 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
 void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     if (EXTERNAL_MMX(cpu_flags)) {
         s->store_slice  = ff_store_slice_mmx;
         s->store_slice2 = ff_store_slice2_mmx;
-        s->mul_thrmat   = ff_mul_thrmat_mmx;
         s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->mul_thrmat   = ff_mul_thrmat_sse2;
+    }
 }
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index a84ae8d5af..117e1c670e 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -18,6 +18,7 @@
 
 #include "checkasm.h"
 #include "libavfilter/vf_fsppdsp.h"
+#include "libavutil/mem_internal.h"
 
 #define randomize_buffers(buf)                           \
     do {                                                 \
@@ -29,10 +30,11 @@
 static void check_mul_thrmat(void)
 {
     FSPPDSPContext fspp;
-    int16_t src[64];
-    int16_t dst_ref[64], dst_new[64];
+    DECLARE_ALIGNED(16, int16_t, src)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_new)[64];
     const int q = (uint8_t)rnd();
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     ff_fsppdsp_init(&fspp);
 

commit 70eb8a76a91e9c9fe3a6c0b4f1c2ff28f5447086
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 18:50:48 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    tests/checkasm: Add vf_fspp mul_thrmat test
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..6636bc7774 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
 AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+AVFILTEROBJS-$(CONFIG_FSPP_FILTER)       += vf_fspp.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_IDET_FILTER)       += vf_idet.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..20d8f19757 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -297,6 +297,9 @@ static const struct {
     #if CONFIG_EQ_FILTER
         { "vf_eq", checkasm_check_vf_eq },
     #endif
+    #if CONFIG_FSPP_FILTER
+        { "vf_fspp", checkasm_check_vf_fspp },
+    #endif
     #if CONFIG_GBLUR_FILTER
         { "vf_gblur", checkasm_check_vf_gblur },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..45cd23cac4 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
 void checkasm_check_vc1dsp(void);
 void checkasm_check_vf_bwdif(void);
 void checkasm_check_vf_eq(void);
+void checkasm_check_vf_fspp(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
new file mode 100644
index 0000000000..a84ae8d5af
--- /dev/null
+++ b/tests/checkasm/vf_fspp.c
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "checkasm.h"
+#include "libavfilter/vf_fsppdsp.h"
+
+#define randomize_buffers(buf)                           \
+    do {                                                 \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+            buf[j] = rnd();                              \
+    } while (0)
+
+
+static void check_mul_thrmat(void)
+{
+    FSPPDSPContext fspp;
+    int16_t src[64];
+    int16_t dst_ref[64], dst_new[64];
+    const int q = (uint8_t)rnd();
+    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
+        randomize_buffers(src);
+        call_ref(src, dst_ref, q);
+        call_new(src, dst_new, q);
+        if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
+            fail();
+        bench_new(src, dst_new, q);
+    }
+}
+
+void checkasm_check_vf_fspp(void)
+{
+    check_mul_thrmat();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index ca1cd0dea3..2be880c8db 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-vf_colordetect                            \
                 fate-checkasm-vf_colorspace                             \
                 fate-checkasm-vf_eq                                     \
+                fate-checkasm-vf_fspp                                   \
                 fate-checkasm-vf_gblur                                  \
                 fate-checkasm-vf_hflip                                  \
                 fate-checkasm-vf_nlmeans                                \

commit 9f4d5d818d709788ab6b199a634a95a2bfcd4898
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:27:16 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/x86/vf_fspp: Don't duplicate dither table
    
    Reuse the one from vf_fsppdsp.c; also don't overalign said table too
    much.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index d2d04463b4..b84d7b57bb 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -51,7 +51,7 @@ enum {
     FIX_1_082392200   = FIX(1.082392200, 13),
 };
 
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
     {  0,  48,  12,  60,   3,  51,  15,  63, },
     { 32,  16,  44,  28,  35,  19,  47,  31, },
     {  8,  56,   4,  52,  11,  59,   7,  55, },
@@ -74,7 +74,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
+        const uint8_t *d = ff_fspp_dither[y];
         for (int x = 0; x < width; x += 8) {
             int temp;
             STORE(0);
@@ -103,7 +103,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
+        const uint8_t *d = ff_fspp_dither[y];
         for (int x = 0; x < width; x += 8) {
             int temp;
             STORE2(0);
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index c441b75094..0dbd628abf 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -52,6 +52,8 @@ typedef struct FSPPDSPContext {
 } FSPPDSPContext;
 
 FF_VISIBILITY_PUSH_HIDDEN
+extern const uint8_t ff_fspp_dither[8][8];
+
 void ff_store_slice_c(uint8_t *dst, int16_t *src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c7f8f64f1b..0ea6216193 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -25,10 +25,7 @@
 
 SECTION_RODATA
 
-pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
-              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
-              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
-             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21
+cextern fspp_dither
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
@@ -73,7 +70,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     sub       tmp2q, widthq
     movd      m2, ditherd ; log2_scale
     add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
     mov       src_strideq, tmp2q
     shl       tmpq, 4
     lea       dither_heightq, [ditherq+dither_heightq*8]
@@ -139,7 +136,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     sub       tmp2q, widthq
     movd      m2, ditherd ; log2_scale
     add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
     mov       src_strideq, tmp2q
     shl       tmpq, 5
     lea       dither_heightq, [ditherq+dither_heightq*8]

commit 1699de09551da5efe413637fcb4c90bcaea31b4c
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:22:21 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/vf_fsppdsp: Use enum for constants
    
    It means that the compiler does not have to optimize the static const
    object away.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index ab31c77203..d2d04463b4 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -29,7 +29,7 @@
 
 #define DCTSIZE 8
 
-#define FIX(x,s)  ((x) * (1 << s) + 0.5)
+#define FIX(x,s)  (int)((x) * (1 << s) + 0.5)
 
 #define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
 #define THRESHOLD(r,x,t)                         \
@@ -38,15 +38,18 @@
 #define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
 
 typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
-static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
-static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
-static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
+
+enum {
+    FIX_0_382683433   = FIX(0.382683433, 14),
+    FIX_0_541196100   = FIX(0.541196100, 14),
+    FIX_0_707106781   = FIX(M_SQRT1_2  , 14),
+    FIX_1_306562965   = FIX(1.306562965, 14),
+    FIX_1_414213562_A = FIX(M_SQRT2    , 14),
+    FIX_1_847759065   = FIX(1.847759065, 13),
+    FIX_2_613125930   = FIX(-2.613125930, 13),
+    FIX_1_414213562   = FIX(M_SQRT2    , 13),
+    FIX_1_082392200   = FIX(1.082392200, 13),
+};
 
 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
     {  0,  48,  12,  60,   3,  51,  15,  63, },

commit 9b34088c4dfec112170a0a0102acb3be1d77d240
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:06:46 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100

    avfilter/vf_fspp: Add DSPCtx, move DSP functions to file of their own
    
    This is in preparation for adding checkasm tests; without it,
    checkasm would pull all of libavfilter in.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 69d74183b2..d56a458e45 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER)              += vf_framestep.o
 OBJS-$(CONFIG_FREEZEDETECT_FILTER)           += vf_freezedetect.o
 OBJS-$(CONFIG_FREEZEFRAMES_FILTER)           += vf_freezeframes.o
 OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
-OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
+OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o vf_fsppdsp.o 
qp_table.o
 OBJS-$(CONFIG_FSYNC_FILTER)                  += vf_fsync.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
 OBJS-$(CONFIG_GBLUR_VULKAN_FILTER)           += vf_gblur_vulkan.o vulkan.o 
vulkan_filter.o
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 6b4a715367..9371c63e77 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -41,12 +41,40 @@
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"
 
+#include "avfilter.h"
 #include "filters.h"
 #include "qp_table.h"
-#include "vf_fspp.h"
+#include "vf_fsppdsp.h"
 #include "video.h"
 
+#define BLOCKSZ  12
+#define MAX_LEVEL 5
+
+typedef struct FSPPContext {
+    const struct AVClass *class;
+    uint64_t threshold_mtx_noq[8 * 2];
+    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later 
SSE2) versions
+
+    int log2_count;
+    int strength;
+    int hsub;
+    int vsub;
+    int temp_stride;
+    int qp;
+    enum AVVideoEncParamsType qscale_type;
+    int prev_q;
+    uint8_t *src;
+    int16_t *temp;
+    int8_t  *non_b_qp_table;
+    int non_b_qp_stride;
+    int use_bframe_qp;
+
+    FSPPDSPContext dsp;
+} FSPPContext;
+
+
 #define OFFSET(x) offsetof(FSPPContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 static const AVOption fspp_options[] = {
@@ -59,17 +87,6 @@ static const AVOption fspp_options[] = {
 
 AVFILTER_DEFINE_CLASS(fspp);
 
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
-    {  0,  48,  12,  60,   3,  51,  15,  63, },
-    { 32,  16,  44,  28,  35,  19,  47,  31, },
-    {  8,  56,   4,  52,  11,  59,   7,  55, },
-    { 40,  24,  36,  20,  43,  27,  39,  23, },
-    {  2,  50,  14,  62,   1,  49,  13,  61, },
-    { 34,  18,  46,  30,  33,  17,  45,  29, },
-    { 10,  58,   6,  54,   9,  57,   5,  53, },
-    { 42,  26,  38,  22,  41,  25,  37,  21, },
-};
-
 static const short custom_threshold[64] = {
 // values (296) can't be too high
 // -it causes too big quant dependence
@@ -84,73 +101,6 @@ static const short custom_threshold[64] = {
      20,  27,  26,  23,  20,  15,  11,   5
 };
 
-//This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_c(uint8_t *dst, int16_t *src,
-                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
-{
-    int y, x;
-#define STORE(pos)                                                             
\
-    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        
\
-    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          
\
-    if (temp & 0x100) temp = ~(temp >> 31);                                    
\
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE(0);
-            STORE(1);
-            STORE(2);
-            STORE(3);
-            STORE(4);
-            STORE(5);
-            STORE(6);
-            STORE(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-//This func reads from 2 slices, 0 & 2  and clears 2-nd
-static void store_slice2_c(uint8_t *dst, int16_t *src,
-                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
-{
-    int y, x;
-#define STORE2(pos)                                                            
                           \
-    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> 
log2_scale)) >> (6 - log2_scale);  \
-    src[x + pos + 16 * src_stride] = 0;                                        
                           \
-    if (temp & 0x100) temp = ~(temp >> 31);                                    
                           \
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE2(0);
-            STORE2(1);
-            STORE2(2);
-            STORE2(3);
-            STORE2(4);
-            STORE2(5);
-            STORE2(6);
-            STORE2(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
-{
-    int a;
-    for (a = 0; a < 64; a++)
-        thr_adr[a] = q * thr_adr_noq[a];
-}
-
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                    int dst_stride, int src_stride,
                    int width, int height,
@@ -197,13 +147,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
         if (qy < 0) qy = 0;
 
         qy = (qy >> qpsv) * qp_stride;
-        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
+        p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
 
         for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 
1)) {
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y&1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y&1), stride, 2 * (BLOCKSZ - 1));
 
             if (p->qp)
-                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 
8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+                p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 
0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
             else
                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                     t = x + x0 - 2;                    //correct 
t=x+x0-2-(y&1), but its the same
@@ -213,288 +163,42 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                     t = qp_store[qy + (t >> qpsh)];
                     t = ff_norm_qscale(t, p->qscale_type);
 
-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t 
*)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
-                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 
x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+                    if (t != p->prev_q) p->prev_q = t, 
p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t 
*)(&p->threshold_mtx[0]), t);
+                    p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), 
block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                 }
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - 
(y & 1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * 
sizeof(int16_t)); //cycling
             memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * 
sizeof(int16_t));
         }
 
         es = width + 8 - x0; //  8, ...
         if (es > 8)
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 
1), stride, (es - 4) >> 2);
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y & 1), stride, (es - 4) >> 2);
 
-        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, 
es&(~1));
+        p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, 
es&(~1));
         if (es > 3)
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - 
(y & 1), stride, es >> 2);
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, es >> 2);
 
         if (!(y1 & 7) && y1) {
             if (y1 & 8)
-                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * 
stride,
-                               dst_stride, stride, width, 8, 5 - 
p->log2_count);
+                p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 
8 * stride,
+                                   dst_stride, stride, width, 8, 5 - 
p->log2_count);
             else
-                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * 
stride,
-                                dst_stride, stride, width, 8, 5 - 
p->log2_count);
+                p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 
0 * stride,
+                                    dst_stride, stride, width, 8, 5 - 
p->log2_count);
         }
     }
 
     if (y & 7) {  // height % 8 != 0
         if (y & 8)
-            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 
* stride,
-                           dst_stride, stride, width, y&7, 5 - p->log2_count);
+            p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 
+ 8 * stride,
+                               dst_stride, stride, width, y&7, 5 - 
p->log2_count);
         else
-            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 
* stride,
+            p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 
+ 0 * stride,
                             dst_stride, stride, width, y&7, 5 - p->log2_count);
     }
 }
 
-static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
-    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
-
-    int16_t *dataptr;
-    int16_t *wsptr;
-    int16_t *threshold;
-    int ctr;
-
-    dataptr = data;
-    wsptr = output;
-
-    for (; cnt > 0; cnt -= 2) { //start positions
-        threshold = (int16_t *)thr_adr;//threshold_mtx
-        for (ctr = DCTSIZE; ctr > 0; ctr--) {
-            // Process columns from input, add to output.
-            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
-            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
-
-            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
-            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
-
-            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
-            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
-
-            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
-            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
-
-            // Even part of FDCT
-
-            tmp10 = tmp0 + tmp3;
-            tmp13 = tmp0 - tmp3;
-            tmp11 = tmp1 + tmp2;
-            tmp12 = tmp1 - tmp2;
-
-            d0 = tmp10 + tmp11;
-            d4 = tmp10 - tmp11;
-
-            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-            d2 = tmp13 + z1;
-            d6 = tmp13 - z1;
-
-            // Even part of IDCT
-
-            THRESHOLD(tmp0, d0, threshold[0 * 8]);
-            THRESHOLD(tmp1, d2, threshold[2 * 8]);
-            THRESHOLD(tmp2, d4, threshold[4 * 8]);
-            THRESHOLD(tmp3, d6, threshold[6 * 8]);
-            tmp0 += 2;
-            tmp10 = (tmp0 + tmp2) >> 2;
-            tmp11 = (tmp0 - tmp2) >> 2;
-
-            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
-            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; 
//<<2
-
-            tmp0 = tmp10 + tmp13; //->temps
-            tmp3 = tmp10 - tmp13; //->temps
-            tmp1 = tmp11 + tmp12; //->temps
-            tmp2 = tmp11 - tmp12; //->temps
-
-            // Odd part of FDCT
-
-            tmp10 = tmp4 + tmp5;
-            tmp11 = tmp5 + tmp6;
-            tmp12 = tmp6 + tmp7;
-
-            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
-            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
-            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
-            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
-
-            z11 = tmp7 + z3;
-            z13 = tmp7 - z3;
-
-            d5 = z13 + z2;
-            d3 = z13 - z2;
-            d1 = z11 + z4;
-            d7 = z11 - z4;
-
-            // Odd part of IDCT
-
-            THRESHOLD(tmp4, d1, threshold[1 * 8]);
-            THRESHOLD(tmp5, d3, threshold[3 * 8]);
-            THRESHOLD(tmp6, d5, threshold[5 * 8]);
-            THRESHOLD(tmp7, d7, threshold[7 * 8]);
-
-            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
-            z13 = tmp6 + tmp5;
-            z10 = (tmp6 - tmp5) << 1;
-            z11 = tmp4 + tmp7;
-            z12 = (tmp4 - tmp7) << 1;
-
-            tmp7  = (z11 + z13) >> 2; //+2 !
-            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
-            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
-            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
-            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - 
!!
-
-            tmp6 = tmp12 - tmp7;
-            tmp5 = tmp11 - tmp6;
-            tmp4 = tmp10 + tmp5;
-
-            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
-            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
-            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
-            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
-            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
-            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
-            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
-            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
-            //
-            dataptr++; //next column
-            wsptr++;
-            threshold++;
-        }
-        dataptr += 8; //skip each second start pos
-        wsptr   += 8;
-    }
-}
-
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z5, z10, z11, z12, z13;
-    int16_t *outptr;
-    int16_t *wsptr;
-
-    cnt *= 4;
-    wsptr = workspace;
-    outptr = output_adr;
-    for (; cnt > 0; cnt--) {
-        // Even part
-        //Simd version reads 4x4 block and transposes it
-        tmp10 = wsptr[2] +  wsptr[3];
-        tmp11 = wsptr[2] -  wsptr[3];
-
-        tmp13 = wsptr[0] +  wsptr[1];
-        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - 
tmp13;//this shift order to avoid overflow
-
-        tmp0 = tmp10 + tmp13; //->temps
-        tmp3 = tmp10 - tmp13; //->temps
-        tmp1 = tmp11 + tmp12;
-        tmp2 = tmp11 - tmp12;
-
-        // Odd part
-        //Also transpose, with previous:
-        // ---- ----      ||||
-        // ---- ---- idct ||||
-        // ---- ---- ---> ||||
-        // ---- ----      ||||
-        z13 = wsptr[4] + wsptr[5];
-        z10 = wsptr[4] - wsptr[5];
-        z11 = wsptr[6] + wsptr[7];
-        z12 = wsptr[6] - wsptr[7];
-
-        tmp7 = z11 + z13;
-        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
-
-        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
-        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
-        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
-
-        tmp6 = (tmp12 << 3) - tmp7;
-        tmp5 = (tmp11 << 3) - tmp6;
-        tmp4 = (tmp10 << 3) + tmp5;
-
-        // Final output stage: descale and write column
-        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
-        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
-        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
-        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
-        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
-        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
-        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
-        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
-        outptr++;
-
-        wsptr += DCTSIZE;       // advance pointer to next row
-    }
-}
-
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t 
line_size, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
-    int16_t *dataptr;
-
-    cnt *= 4;
-    // Pass 1: process rows.
-
-    dataptr = data;
-    for (; cnt > 0; cnt--) {
-        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
-        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
-        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
-        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
-        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
-        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
-        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
-        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
-
-        // Even part
-
-        tmp10 = tmp0 + tmp3;
-        tmp13 = tmp0 - tmp3;
-        tmp11 = tmp1 + tmp2;
-        tmp12 = tmp1 - tmp2;
-        //Even columns are written first, this leads to different order of 
columns
-        //in column_fidct(), but they are processed independently, so all ok.
-        //Later in the row_idct() columns are read in the same order.
-        dataptr[2] = tmp10 + tmp11;
-        dataptr[3] = tmp10 - tmp11;
-
-        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-        dataptr[0] = tmp13 + z1;
-        dataptr[1] = tmp13 - z1;
-
-        // Odd part
-
-        tmp10 = (tmp4 + tmp5) << 2;
-        tmp11 = (tmp5 + tmp6) << 2;
-        tmp12 = (tmp6 + tmp7) << 2;
-
-        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
-        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
-        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
-        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
-
-        z11 = tmp7 + z3;
-        z13 = tmp7 - z3;
-
-        dataptr[4] = z13 + z2;
-        dataptr[5] = z13 - z2;
-        dataptr[6] = z11 + z4;
-        dataptr[7] = z11 - z4;
-
-        pixels++;               // advance pointer to next column
-        dataptr += DCTSIZE;
-    }
-}
-
 static const enum AVPixelFormat pix_fmts[] = {
     AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
@@ -522,16 +226,7 @@ static int config_input(AVFilterLink *inlink)
     if (!fspp->temp || !fspp->src)
         return AVERROR(ENOMEM);
 
-    fspp->store_slice  = store_slice_c;
-    fspp->store_slice2 = store_slice2_c;
-    fspp->mul_thrmat   = mul_thrmat_c;
-    fspp->column_fidct = column_fidct_c;
-    fspp->row_idct     = row_idct_c;
-    fspp->row_fdct     = row_fdct_c;
-
-#if ARCH_X86
-    ff_fspp_init_x86(fspp);
-#endif
+    ff_fsppdsp_init(&fspp->dsp);
 
     return 0;
 }
@@ -567,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     }
 
     if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t 
*)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), 
fspp->qp);
+        fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t 
*)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), 
fspp->qp);
 
     /* if we are not in a constant user quantizer mode and we don't want to use
      * the quantizers from the B-frames (B-frames often have a higher QP), we
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
new file mode 100644
index 0000000000..ab31c77203
--- /dev/null
+++ b/libavfilter/vf_fsppdsp.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <[email protected]>
+ * Copyright (C) 2005 Nikolaj Poroshin <[email protected]>
+ * Copyright (c) 2014 Arwa Arif <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+
+#include "vf_fsppdsp.h"
+
+#include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
+
+#define DCTSIZE 8
+
+#define FIX(x,s)  ((x) * (1 << s) + 0.5)
+
+#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
+#define THRESHOLD(r,x,t)                         \
+    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
+    else r = 0;
+#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
+
+typedef int32_t int_simd16_t;
+static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
+static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
+static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
+static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
+static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
+static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
+static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
+static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
+static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
+
+DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+    {  0,  48,  12,  60,   3,  51,  15,  63, },
+    { 32,  16,  44,  28,  35,  19,  47,  31, },
+    {  8,  56,   4,  52,  11,  59,   7,  55, },
+    { 40,  24,  36,  20,  43,  27,  39,  23, },
+    {  2,  50,  14,  62,   1,  49,  13,  61, },
+    { 34,  18,  46,  30,  33,  17,  45,  29, },
+    { 10,  58,   6,  54,   9,  57,   5,  53, },
+    { 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+
+//This func reads from 1 slice, 1 and clears 0 & 1
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE(pos)                                                             
\
+    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        
\
+    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          
\
+    if (temp & 0x100) temp = ~(temp >> 31);                                    
\
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) {
+        const uint8_t *d = dither[y];
+        for (int x = 0; x < width; x += 8) {
+            int temp;
+            STORE(0);
+            STORE(1);
+            STORE(2);
+            STORE(3);
+            STORE(4);
+            STORE(5);
+            STORE(6);
+            STORE(7);
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+//This func reads from 2 slices, 0 & 2  and clears 2-nd
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE2(pos)                                                            
                           \
+    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> 
log2_scale)) >> (6 - log2_scale);  \
+    src[x + pos + 16 * src_stride] = 0;                                        
                           \
+    if (temp & 0x100) temp = ~(temp >> 31);                                    
                           \
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) {
+        const uint8_t *d = dither[y];
+        for (int x = 0; x < width; x += 8) {
+            int temp;
+            STORE2(0);
+            STORE2(1);
+            STORE2(2);
+            STORE2(3);
+            STORE2(4);
+            STORE2(5);
+            STORE2(6);
+            STORE2(7);
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+{
+    for (int a = 0; a < 64; a++)
+        thr_adr[a] = q * thr_adr_noq[a];
+}
+
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int 
cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+    int16_t *dataptr;
+    int16_t *wsptr;
+    int16_t *threshold;
+
+    dataptr = data;
+    wsptr = output;
+
+    for (; cnt > 0; cnt -= 2) { //start positions
+        threshold = (int16_t *)thr_adr;//threshold_mtx
+        for (int ctr = DCTSIZE; ctr > 0; ctr--) {
+            // Process columns from input, add to output.
+            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+
+            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+
+            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+
+            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+            // Even part of FDCT
+
+            tmp10 = tmp0 + tmp3;
+            tmp13 = tmp0 - tmp3;
+            tmp11 = tmp1 + tmp2;
+            tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            // Even part of IDCT
+
+            THRESHOLD(tmp0, d0, threshold[0 * 8]);
+            THRESHOLD(tmp1, d2, threshold[2 * 8]);
+            THRESHOLD(tmp2, d4, threshold[4 * 8]);
+            THRESHOLD(tmp3, d6, threshold[6 * 8]);
+            tmp0 += 2;
+            tmp10 = (tmp0 + tmp2) >> 2;
+            tmp11 = (tmp0 - tmp2) >> 2;
+
+            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
+            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; 
//<<2
+
+            tmp0 = tmp10 + tmp13; //->temps
+            tmp3 = tmp10 - tmp13; //->temps
+            tmp1 = tmp11 + tmp12; //->temps
+            tmp2 = tmp11 - tmp12; //->temps
+
+            // Odd part of FDCT
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
+            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
+            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
+            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            d5 = z13 + z2;
+            d3 = z13 - z2;
+            d1 = z11 + z4;
+            d7 = z11 - z4;
+
+            // Odd part of IDCT
+
+            THRESHOLD(tmp4, d1, threshold[1 * 8]);
+            THRESHOLD(tmp5, d3, threshold[3 * 8]);
+            THRESHOLD(tmp6, d5, threshold[5 * 8]);
+            THRESHOLD(tmp7, d7, threshold[7 * 8]);
+
+            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+            z13 = tmp6 + tmp5;
+            z10 = (tmp6 - tmp5) << 1;
+            z11 = tmp4 + tmp7;
+            z12 = (tmp4 - tmp7) << 1;
+
+            tmp7  = (z11 + z13) >> 2; //+2 !
+            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
+            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - 
!!
+
+            tmp6 = tmp12 - tmp7;
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 + tmp5;
+
+            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
+            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
+            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
+            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
+            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
+            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
+            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
+            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
+            //
+            dataptr++; //next column
+            wsptr++;
+            threshold++;
+        }
+        dataptr += 8; //skip each second start pos
+        wsptr   += 8;
+    }
+}
+
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z5, z10, z11, z12, z13;
+    int16_t *outptr;
+    int16_t *wsptr;
+
+    cnt *= 4;
+    wsptr = workspace;
+    outptr = output_adr;
+    for (; cnt > 0; cnt--) {
+        // Even part
+        //Simd version reads 4x4 block and transposes it
+        tmp10 = wsptr[2] +  wsptr[3];
+        tmp11 = wsptr[2] -  wsptr[3];
+
+        tmp13 = wsptr[0] +  wsptr[1];
+        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - 
tmp13;//this shift order to avoid overflow
+
+        tmp0 = tmp10 + tmp13; //->temps
+        tmp3 = tmp10 - tmp13; //->temps
+        tmp1 = tmp11 + tmp12;
+        tmp2 = tmp11 - tmp12;
+
+        // Odd part
+        //Also transpose, with previous:
+        // ---- ----      ||||
+        // ---- ---- idct ||||
+        // ---- ---- ---> ||||
+        // ---- ----      ||||
+        z13 = wsptr[4] + wsptr[5];
+        z10 = wsptr[4] - wsptr[5];
+        z11 = wsptr[6] + wsptr[7];
+        z12 = wsptr[6] - wsptr[7];
+
+        tmp7 = z11 + z13;
+        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+
+        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
+        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
+
+        tmp6 = (tmp12 << 3) - tmp7;
+        tmp5 = (tmp11 << 3) - tmp6;
+        tmp4 = (tmp10 << 3) + tmp5;
+
+        // Final output stage: descale and write column
+        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
+        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
+        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
+        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
+        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
+        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
+        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
+        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
+        outptr++;
+
+        wsptr += DCTSIZE;       // advance pointer to next row
+    }
+}
+
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+    int16_t *dataptr;
+
+    cnt *= 4;
+    // Pass 1: process rows.
+
+    dataptr = data;
+    for (; cnt > 0; cnt--) {
+        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
+        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
+        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
+        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
+        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
+        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
+        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
+        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
+
+        // Even part
+
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+        //Even columns are written first, this leads to different order of columns
+        //in column_fidct(), but they are processed independently, so all ok.
+        //Later in the row_idct() columns are read in the same order.
+        dataptr[2] = tmp10 + tmp11;
+        dataptr[3] = tmp10 - tmp11;
+
+        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+        dataptr[0] = tmp13 + z1;
+        dataptr[1] = tmp13 - z1;
+
+        // Odd part
+
+        tmp10 = (tmp4 + tmp5) << 2;
+        tmp11 = (tmp5 + tmp6) << 2;
+        tmp12 = (tmp6 + tmp7) << 2;
+
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
+        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
+        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
+        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
+
+        z11 = tmp7 + z3;
+        z13 = tmp7 - z3;
+
+        dataptr[4] = z13 + z2;
+        dataptr[5] = z13 - z2;
+        dataptr[6] = z11 + z4;
+        dataptr[7] = z11 - z4;
+
+        pixels++;               // advance pointer to next column
+        dataptr += DCTSIZE;
+    }
+}
diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fsppdsp.h
similarity index 52%
rename from libavfilter/vf_fspp.h
rename to libavfilter/vf_fsppdsp.h
index ee7de3ffef..c441b75094 100644
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -20,56 +20,17 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#ifndef AVFILTER_FSPP_H
-#define AVFILTER_FSPP_H
+#ifndef AVFILTER_FSPPDSP_H
+#define AVFILTER_FSPPDSP_H
 
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
+#include <stddef.h>
+#include <stdint.h>
 
-#define BLOCKSZ 12
-#define MAX_LEVEL 5
+#include "config.h"
 
-#define DCTSIZE 8
-#define DCTSIZE_S "8"
-
-#define FIX(x,s)  ((x) * (1 << s) + 0.5)
-
-#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
-#define THRESHOLD(r,x,t)                         \
-    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
-    else r = 0;
-#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
-
-typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
-static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
-static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
-static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
-
-typedef struct FSPPContext {
-    AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
-
-    int log2_count;
-    int strength;
-    int hsub;
-    int vsub;
-    int temp_stride;
-    int qp;
-    enum AVVideoEncParamsType qscale_type;
-    int prev_q;
-    uint8_t *src;
-    int16_t *temp;
-    int8_t  *non_b_qp_table;
-    int non_b_qp_stride;
-    int use_bframe_qp;
+#include "libavutil/attributes_internal.h"
 
+typedef struct FSPPDSPContext {
     void (*store_slice)(uint8_t *dst, int16_t *src,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
@@ -88,9 +49,35 @@ typedef struct FSPPContext {
 
     void (*row_fdct)(int16_t *data, const uint8_t *pixels,
                      ptrdiff_t line_size, int cnt);
-
-} FSPPContext;
-
-void ff_fspp_init_x86(FSPPContext *fspp);
-
-#endif /* AVFILTER_FSPP_H */
+} FSPPDSPContext;
+
+FF_VISIBILITY_PUSH_HIDDEN
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+
+void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
+FF_VISIBILITY_POP_HIDDEN
+
+static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
+{
+    fspp->store_slice  = ff_store_slice_c;
+    fspp->store_slice2 = ff_store_slice2_c;
+    fspp->mul_thrmat   = ff_mul_thrmat_c;
+    fspp->column_fidct = ff_column_fidct_c;
+    fspp->row_idct     = ff_row_idct_c;
+    fspp->row_fdct     = ff_row_fdct_c;
+
+#if ARCH_X86
+    ff_fsppdsp_init_x86(fspp);
+#endif
+}
+
+#endif /* AVFILTER_FSPPDSP_H */
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 8e00317cb7..2aadb50967 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -21,7 +21,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_fspp.h"
+#include "libavfilter/vf_fsppdsp.h"
 
 void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
@@ -34,7 +34,7 @@ void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int c
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
 
-av_cold void ff_fspp_init_x86(FSPPContext *s)
+av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 

-----------------------------------------------------------------------

Summary of changes:
 libavfilter/Makefile           |   2 +-
 libavfilter/vf_fspp.c          | 450 +++++++---------------------------------
 libavfilter/vf_fspp.h          |  96 ---------
 libavfilter/vf_fsppdsp.c       | 371 +++++++++++++++++++++++++++++++++
 libavfilter/vf_fsppdsp.h       |  89 ++++++++
 libavfilter/x86/vf_fspp.asm    | 452 ++++++++++++++++++++---------------------
 libavfilter/x86/vf_fspp_init.c |  28 +--
 libavfilter/x86/vf_spp.c       |   2 +-
 tests/checkasm/Makefile        |   1 +
 tests/checkasm/checkasm.c      |   3 +
 tests/checkasm/checkasm.h      |   1 +
 tests/checkasm/vf_fspp.c       | 170 ++++++++++++++++
 tests/fate/checkasm.mak        |   1 +
 13 files changed, 937 insertions(+), 729 deletions(-)
 delete mode 100644 libavfilter/vf_fspp.h
 create mode 100644 libavfilter/vf_fsppdsp.c
 create mode 100644 libavfilter/vf_fsppdsp.h
 create mode 100644 tests/checkasm/vf_fspp.c


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to