The branch, master has been updated
via ddf443f1e99c94b5e3569904027eba868691b86b (commit)
via f8bcea4946d2ce1f5271a550fcc9131797505eed (commit)
via 0c556a6b09b43a977e12cab346a55370bb09dd05 (commit)
via 778ff97efa0f4622136de73cef3b19ed1bcc082a (commit)
via f4421457291c1ffa8db328c806d73b844b3e3450 (commit)
via c0648b200497f56b611f0be86871269aa073c90b (commit)
via 06b0dae51bce088ec304771e25ab02c3846e169d (commit)
via cc97f1e276241d93139b3ac46a358b7b9f77066b (commit)
via 3cd452cbf15459334e52c7f2aa92654c822732d5 (commit)
via ddd74276f85b3c53809d59ffc640b7b45f5a125f (commit)
via 68b11cde8212b8ea0309ef6d11b01c782fa0b943 (commit)
via 63493bf0e0909e701b64392be419f69491b8cbf1 (commit)
via 66af18d06a3faf9f8960ad6bd5a400701a0cdaab (commit)
via 1049a5fba8f9437b94050105be8d32545675315e (commit)
via d19050a1ae90b4ad8e9e2dadc5c8ca0c39301d69 (commit)
via ff85a20b7db4d3226ada8533b181989944f30e75 (commit)
via 570f8fc6c9850edf6c05d58dea0629f162199f20 (commit)
via e042f17e9947779e3b1b981218370472940ca3c6 (commit)
via 52ba2ac7bd48d09d1f8527376970e2b0e8ee5068 (commit)
via 70eb8a76a91e9c9fe3a6c0b4f1c2ff28f5447086 (commit)
via 9f4d5d818d709788ab6b199a634a95a2bfcd4898 (commit)
via 1699de09551da5efe413637fcb4c90bcaea31b4c (commit)
via 9b34088c4dfec112170a0a0102acb3be1d77d240 (commit)
from 57d6898730836ac2006d10bf18396752de092e49 (commit)
- Log -----------------------------------------------------------------
commit ddf443f1e99c94b5e3569904027eba868691b86b
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:57:02 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fsppdsp: Fix left shifts of negative numbers
They are undefined behavior and UBSan warns about them
(in the checkasm test). Put the shifts in the constants
instead. This even gives a tiny speedup here:
Old benchmarks:
column_fidct_c: 3369.9 ( 1.00x)
column_fidct_sse2: 829.1 ( 4.06x)
New benchmarks:
column_fidct_c: 3304.2 ( 1.00x)
column_fidct_sse2: 827.9 ( 3.99x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 3230376a19..8025e87366 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -165,7 +165,7 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr,
const int16_t *restrict
d0 = tmp10 + tmp11;
d4 = tmp10 - tmp11;
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
d2 = tmp13 + z1;
d6 = tmp13 - z1;
@@ -193,10 +193,10 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr,
const int16_t *restrict
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
- z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
@@ -215,15 +215,15 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr,
const int16_t *restrict
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
z13 = tmp6 + tmp5;
- z10 = (tmp6 - tmp5) << 1;
+ z10 = (tmp6 - tmp5) * 2;
z11 = tmp4 + tmp7;
- z12 = (tmp4 - tmp7) << 1;
+ z12 = (tmp4 - tmp7) * 2;
tmp7 = (z11 + z13) >> 2; //+2 !
- tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // -
!!
+ tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
@@ -264,7 +264,7 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t
*restrict output_adr,
tmp11 = wsptr[2] - wsptr[3];
tmp13 = wsptr[0] + wsptr[1];
- tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) -
tmp13;//this shift order to avoid overflow
+ tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) -
tmp13;//this shift order to avoid overflow
tmp0 = tmp10 + tmp13; //->temps
tmp3 = tmp10 - tmp13; //->temps
@@ -289,9 +289,9 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t
*restrict output_adr,
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
- tmp6 = (tmp12 << 3) - tmp7;
- tmp5 = (tmp11 << 3) - tmp6;
- tmp4 = (tmp10 << 3) + tmp5;
+ tmp6 = tmp12 * 8 - tmp7;
+ tmp5 = tmp11 * 8 - tmp6;
+ tmp4 = tmp10 * 8 + tmp5;
// Final output stage: descale and write column
outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
@@ -342,20 +342,20 @@ void ff_row_fdct_c(int16_t *restrict data, const uint8_t
*restrict pixels,
dataptr[2] = tmp10 + tmp11;
dataptr[3] = tmp10 - tmp11;
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
dataptr[0] = tmp13 + z1;
dataptr[1] = tmp13 - z1;
// Odd part
- tmp10 = (tmp4 + tmp5) << 2;
- tmp11 = (tmp5 + tmp6) << 2;
- tmp12 = (tmp6 + tmp7) << 2;
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
- z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
commit f8bcea4946d2ce1f5271a550fcc9131797505eed
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 12:04:15 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fsppdsp: Remove pointless cast
Also don't cast const away, and use a smaller scope.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 7fdc5ece25..3230376a19 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -136,12 +136,11 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr,
const int16_t *restrict
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
int16_t *wsptr;
- int16_t *threshold;
wsptr = output;
for (; cnt > 0; cnt -= 2) { //start positions
- threshold = (int16_t *)thr_adr;//threshold_mtx
+ const int16_t *threshold = thr_adr;//threshold_mtx
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
// Process columns from input, add to output.
tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
commit 0c556a6b09b43a977e12cab346a55370bb09dd05
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:18:28 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fspp: Pre-reorder threshold table
Avoids reordering at runtime.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9e5c688fb2..cbf2e06d67 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -92,14 +92,16 @@ static const short custom_threshold[64] = {
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
- 71, 296, 295, 237, 71, 40, 38, 19,
- 245, 193, 185, 121, 102, 73, 53, 27,
- 158, 129, 141, 107, 97, 73, 50, 26,
- 102, 116, 109, 98, 82, 66, 45, 23,
- 71, 94, 95, 81, 70, 56, 38, 20,
- 56, 77, 74, 66, 56, 44, 30, 15,
- 38, 53, 50, 45, 38, 30, 21, 11,
- 20, 27, 26, 23, 20, 15, 11, 5
+// reorder coefficients to the order in which columns are processed
+#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
+ REORDER( 71, 296, 295, 237, 71, 40, 38, 19),
+ REORDER(245, 193, 185, 121, 102, 73, 53, 27),
+ REORDER(158, 129, 141, 107, 97, 73, 50, 26),
+ REORDER(102, 116, 109, 98, 82, 66, 45, 23),
+ REORDER( 71, 94, 95, 81, 70, 56, 38, 20),
+ REORDER( 56, 77, 74, 66, 56, 44, 30, 15),
+ REORDER( 38, 53, 50, 45, 38, 30, 21, 11),
+ REORDER( 20, 27, 26, 23, 20, 15, 11, 5)
};
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -244,25 +246,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
int qp_stride = 0;
int8_t *qp_table = NULL;
- int i, bias;
int ret = 0;
- int custom_threshold_m[64];
-
- bias = (1 << 4) + fspp->strength;
-
- for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
- custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) +
0.5);
-
- for (int i = 0; i < 64; i += 8) {
- fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
- fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
- fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
- fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
- fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
- fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
- fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
- fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
- }
+
+ //FIXME: tune custom_threshold[] and remove this !
+ for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
+ fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0)
+ 0.5);
if (fspp->qp) {
fspp->prev_q = fspp->qp;
commit 778ff97efa0f4622136de73cef3b19ed1bcc082a
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 11:02:56 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fspp: Make output endian-independent
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 670e9288d9..9e5c688fb2 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,7 +37,6 @@
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
-#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
@@ -254,16 +253,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) +
0.5);
- for (i = 0; i < 8; i++) {
- AV_WN64A(&fspp->threshold_mtx_noq[8 * i],
(uint64_t)custom_threshold_m[i * 8 + 2]
- |(((uint64_t)custom_threshold_m[i * 8 +
6]) << 16)
- |(((uint64_t)custom_threshold_m[i * 8 +
0]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 +
4]) << 48));
-
- AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4],
(uint64_t)custom_threshold_m[i * 8 + 5]
- |(((uint64_t)custom_threshold_m[i *
8 + 3]) << 16)
- |(((uint64_t)custom_threshold_m[i *
8 + 1]) << 32)
- |(((uint64_t)custom_threshold_m[i *
8 + 7]) << 48));
+ for (int i = 0; i < 64; i += 8) {
+ fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
+ fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
+ fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
+ fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
+ fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
+ fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
+ fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
+ fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
}
if (fspp->qp) {
commit f4421457291c1ffa8db328c806d73b844b3e3450
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 13 10:48:23 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fspp: Avoid casts, effective-type violations
Maybe uint64_t has been used as a poor man's alignment specifier?
Anyway, reading a uint64_t via an lvalue of type int16_t (as happens
in the C versions of the dsp functions) is undefined behavior.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 3db7fe114e..670e9288d9 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,6 +37,7 @@
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
@@ -71,8 +72,8 @@ typedef struct FSPPContext {
FSPPDSPContext dsp;
- DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
- DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
+ DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
+ DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
} FSPPContext;
@@ -154,7 +155,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 -
(y&1), stride, 2 * (BLOCKSZ - 1));
if (p->qp)
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block +
0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+ p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 +
0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
else
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
t = x + x0 - 2; //correct
t=x+x0-2-(y&1), but its the same
@@ -164,8 +165,11 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
t = qp_store[qy + (t >> qpsh)];
t = ff_norm_qscale(t, p->qscale_type);
- if (t != p->prev_q) p->prev_q = t,
p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t
*)(&p->threshold_mtx[0]), t);
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]),
block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+ if (t != p->prev_q) {
+ p->prev_q = t;
+ p->dsp.mul_thrmat(p->threshold_mtx_noq,
p->threshold_mtx, t);
+ }
+ p->dsp.column_fidct(p->threshold_mtx, block + x * 8,
block3 + x * 8, 8); //yes, this is a HOTSPOT
}
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 +
2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 *
sizeof(int16_t)); //cycling
@@ -176,7 +180,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
if (es > 8)
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 -
(y & 1), stride, (es - 4) >> 2);
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3,
es&(~1));
+ p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
if (es > 3)
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 +
2 - (y & 1), stride, es >> 2);
@@ -251,19 +255,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) +
0.5);
for (i = 0; i < 8; i++) {
- fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 +
2]
+ AV_WN64A(&fspp->threshold_mtx_noq[8 * i],
(uint64_t)custom_threshold_m[i * 8 + 2]
|(((uint64_t)custom_threshold_m[i * 8 +
6]) << 16)
|(((uint64_t)custom_threshold_m[i * 8 +
0]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 +
4]) << 48);
+ |(((uint64_t)custom_threshold_m[i * 8 +
4]) << 48));
- fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i *
8 + 5]
+ AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4],
(uint64_t)custom_threshold_m[i * 8 + 5]
|(((uint64_t)custom_threshold_m[i *
8 + 3]) << 16)
|(((uint64_t)custom_threshold_m[i *
8 + 1]) << 32)
- |(((uint64_t)custom_threshold_m[i *
8 + 7]) << 48);
+ |(((uint64_t)custom_threshold_m[i *
8 + 7]) << 48));
}
- if (fspp->qp)
- fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t
*)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]),
fspp->qp);
+ if (fspp->qp) {
+ fspp->prev_q = fspp->qp;
+ fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx,
fspp->qp);
+ }
/* if we are not in a constant user quantizer mode and we don't want to use
* the quantizers from the B-frames (B-frames often have a higher QP), we
commit c0648b200497f56b611f0be86871269aa073c90b
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:26:04 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/x86/vf_spp: Fix comment
Forgotten in dcb28ed860166c9715afb7c71c70889e6b9b8c8d.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index 48c3d25d7c..7dcf18ec7d 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
}
}
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
av_cold void ff_spp_init_x86(SPPContext *s)
{
commit 06b0dae51bce088ec304771e25ab02c3846e169d
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:15:24 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fsppdsp: Constify
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index e530bcd06b..7fdc5ece25 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -121,13 +121,13 @@ void ff_store_slice2_c(uint8_t *restrict dst, int16_t
*restrict src,
}
}
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr,
int q)
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict
thr_adr, int q)
{
for (int a = 0; a < 64; a++)
thr_adr[a] = q * thr_adr_noq[a];
}
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t
*restrict data,
int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -135,28 +135,26 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t
*restrict data,
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16_t *dataptr;
int16_t *wsptr;
int16_t *threshold;
- dataptr = data;
wsptr = output;
for (; cnt > 0; cnt -= 2) { //start positions
threshold = (int16_t *)thr_adr;//threshold_mtx
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
// Process columns from input, add to output.
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
- tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
+ tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
- tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
- tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
+ tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
- tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
- tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
+ tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
- tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
- tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+ tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
+ tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
// Even part of FDCT
@@ -241,26 +239,24 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t
*restrict data,
wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
//
- dataptr++; //next column
+ data++; //next column
wsptr++;
threshold++;
}
- dataptr += 8; //skip each second start pos
+ data += 8; //skip each second start pos
wsptr += 8;
}
}
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z5, z10, z11, z12, z13;
int16_t *outptr;
- int16_t *wsptr;
cnt *= 4;
- wsptr = workspace;
outptr = output_adr;
for (; cnt > 0; cnt--) {
// Even part
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 66030da4b1..5a2f1af030 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,13 +39,13 @@ typedef struct FSPPDSPContext {
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
- void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+ void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
int16_t *restrict thr_adr /* align 16 */, int q);
- void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+ void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t
*restrict data,
int16_t *restrict output, int cnt);
- void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
+ void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict
output_adr,
ptrdiff_t output_stride, int cnt);
void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
@@ -61,10 +61,10 @@ void ff_store_slice_c(uint8_t *restrict dst, int16_t
*restrict src,
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr,
int q);
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict
thr_adr, int q);
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t
*restrict data,
int16_t *restrict output, int cnt);
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict
output_adr,
ptrdiff_t output_stride, int cnt);
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index c7a9b1799e..caf94b30d6 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,9 +29,9 @@ void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
-void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
-void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
+void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t
*output, int cnt);
+void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t
line_size, int cnt);
av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index ccb9cd1e7d..496e859fe0 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -116,7 +116,7 @@ static void check_mul_thrmat(void)
DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
DECLARE_ALIGNED(16, int16_t, dst_new)[64];
const int q = (uint8_t)rnd();
- declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+ declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
ff_fsppdsp_init(&fspp);
@@ -136,7 +136,7 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
- declare_func(void, int16_t *thr_adr, int16_t *data,
+ declare_func(void, const int16_t *thr_adr, const int16_t *data,
int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);
commit cc97f1e276241d93139b3ac46a358b7b9f77066b
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 23:05:30 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/vf_fspp: Fix effective type violation
Also don't use an unnecessarily large alignment; this avoids having to align
the stack.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index fa562cbd45..3db7fe114e 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -114,9 +114,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
const int qpsh = 4 - p->hsub * !is_luma;
const int qpsv = 4 - p->vsub * !is_luma;
- DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 *
BLOCKSZ];
- int16_t *block = (int16_t *)block_align;
- int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
+ DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 *
BLOCKSZ];
+ int16_t *block = block_align;
+ int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
memset(block3, 0, 4 * 8 * BLOCKSZ);
commit 3cd452cbf15459334e52c7f2aa92654c822732d5
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 22:44:28 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:12 2025 +0100
avfilter/x86/vf_fspp: Avoid stack on x64
Possible due to the number of registers.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 3f37911722..cad44ed0bf 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -210,35 +210,47 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m6, m2
psubw m7, m2
mova m2, m5
+%if ARCH_X86_64
+ mova m8, [thrq]
+%define THRQ m8
+%else
+%define THRQ [thrq]
+%endif
paddw m5, m6
psubw m2, m6
paddw m7, m1
mova m6, [thrq+4*16]
psllw m7, 1
- psubw m5, [thrq]
+ psubw m5, THRQ
psubw m2, m6
- paddusw m5, [thrq]
+ paddusw m5, THRQ
paddusw m2, m6
- pmulhw m7, [pw_5A82]
- paddw m5, [thrq]
+ pmulhw m7, SQRT2
+ paddw m5, THRQ
paddw m2, m6
- psubusw m5, [thrq]
+ psubusw m5, THRQ
psubusw m2, m6
paddw m5, [pw_2]
mova m6, m2
paddw m2, m5
+%if ARCH_X86_64
+ mova m8, [thrq+2*16]
+%define THRQ m8
+%else
+%define THRQ [thrq+2*16]
+%endif
psubw m5, m6
mova m6, m1
paddw m1, m7
- psubw m1, [thrq+2*16]
+ psubw m1, THRQ
psubw m6, m7
mova m7, [thrq+6*16]
psraw m5, 2
- paddusw m1, [thrq+2*16]
+ paddusw m1, THRQ
psubw m6, m7
- paddw m1, [thrq+2*16]
+ paddw m1, THRQ
paddusw m6, m7
- psubusw m1, [thrq+2*16]
+ psubusw m1, THRQ
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
@@ -250,15 +262,15 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m6, m7
psraw m6, 2
mova m7, m2
- pmulhw m1, [pw_5A82]
+ pmulhw m1, SQRT2
paddw m2, m6
- mova [rsp], m2
+ mova tmp0, m2
psubw m7, m6
mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
mova m6, m5
- mova [rsp+16*3], m7
+ mova tmp3, m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
@@ -272,14 +284,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psllw m2, 1
pmulhw m4, [pw_539F]
paddw m5, m1
- pmulhw m2, [pw_5A82]
+ pmulhw m2, SQRT2
psubw m6, m1
paddw m7, m3
- mova [rsp+16], m5
+ mova tmp1, m5
paddw m4, m3
mova m3, [thrq+3*16]
mova m1, m0
- mova [rsp+16*2], m6
+ mova tmp2, m6
psubw m1, m2
paddw m0, m2
mova m5, m1
@@ -319,14 +331,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%endif
or tmpq, tmpq
jnz %1
- mova m4, [rsp]
+ mova m4, tmp0
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
- pmulhw m0, [pw_5A82]
+ pmulhw m0, SQRT2
paddw m5, m4
- mova m6, [rsp+16]
+ mova m6, tmp1
psubw m2, m1
psubw m4, m3
mova m7, [outq+DCTSIZE*1*2]
@@ -337,7 +349,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
- mova m3, [rsp+16*2]
+ mova m3, tmp2
psubw m6, m1
mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
@@ -349,7 +361,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m4, m0
mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
- mova m0, [rsp+16*3]
+ mova m0, tmp3
mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
mova [outq+DCTSIZE*2*2], m4
@@ -376,23 +388,23 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m6
pmulhw m0, [pw_4546]
mova m7, m2
- mova m4, [rsp]
+ mova m4, tmp0
psubw m2, m3
paddw m7, m3
- pmulhw m2, [pw_5A82]
+ pmulhw m2, SQRT2
mova m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
- mova m3, [rsp+16]
+ mova m3, tmp1
paddw m4, m7
mova [outq+DCTSIZE*7*2], m6
paddw m1, m5
mova [outq], m4
psubw m1, m7
- mova m7, [rsp+16*2]
+ mova m7, tmp2
psubw m0, m5
- mova m6, [rsp+16*3]
+ mova m6, tmp3
mova m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
@@ -419,7 +431,21 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%endmacro
;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
-cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out,
cnt, tmp
+%if ARCH_X86_64
+ %define tmp0 m8
+ %define tmp1 m9
+ %define tmp2 m10
+ %define tmp3 m11
+ %define SQRT2 m12
+ mova m12, [pw_5A82]
+%else
+ %define tmp0 [rsp]
+ %define tmp1 [rsp+16]
+ %define tmp2 [rsp+2*16]
+ %define tmp3 [rsp+3*16]
+ %define SQRT2 [pw_5A82]
+%endif
.fdct:
COLUMN_FDCT .idct
sub cntd, 2
commit ddd74276f85b3c53809d59ffc640b7b45f5a125f
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 21:42:32 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100
avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2
It gains a lot because it has to operate on eight words;
it also saves 608B of .text here.
Old benchmarks:
column_fidct_c: 3365.7 ( 1.00x)
column_fidct_mmx: 1784.6 ( 1.89x)
New benchmarks:
column_fidct_c: 3361.5 ( 1.00x)
column_fidct_sse2: 801.1 ( 4.20x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index f61efc99f8..3f37911722 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -26,18 +26,18 @@
SECTION_RODATA
cextern fspp_dither
+pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
+pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
+pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
+pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
+pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
+pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
+pw_2: times 8 dw 2
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
-pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
-pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
-pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
-pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
-pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
-pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4
-pw_2: times 4 dw 2
SECTION .text
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
mova [thrq+16*7], m3
RET
-%macro COLUMN_FDCT 1-3 0, 0
- movq m1, [srcq+DCTSIZE*0*2]
- movq m7, [srcq+DCTSIZE*3*2]
- movq m0, m1
+%macro COLUMN_FDCT 1
+ mova m1, [srcq+DCTSIZE*0*2]
+ mova m7, [srcq+DCTSIZE*3*2]
+ mova m0, m1
paddw m1, [srcq+DCTSIZE*7*2]
- movq m3, m7
+ mova m3, m7
paddw m7, [srcq+DCTSIZE*4*2]
- movq m5, m1
- movq m6, [srcq+DCTSIZE*1*2]
+ mova m5, m1
+ mova m6, [srcq+DCTSIZE*1*2]
psubw m1, m7
- movq m2, [srcq+DCTSIZE*2*2]
- movq m4, m6
+ mova m2, [srcq+DCTSIZE*2*2]
+ mova m4, m6
paddw m6, [srcq+DCTSIZE*6*2]
paddw m5, m7
paddw m2, [srcq+DCTSIZE*5*2]
- movq m7, m6
+ mova m7, m6
paddw m6, m2
psubw m7, m2
- movq m2, m5
+ mova m2, m5
paddw m5, m6
psubw m2, m6
paddw m7, m1
- movq m6, [thrq+4*16+%2]
+ mova m6, [thrq+4*16]
psllw m7, 1
- psubw m5, [thrq+%2]
+ psubw m5, [thrq]
psubw m2, m6
- paddusw m5, [thrq+%2]
+ paddusw m5, [thrq]
paddusw m2, m6
pmulhw m7, [pw_5A82]
- paddw m5, [thrq+%2]
+ paddw m5, [thrq]
paddw m2, m6
- psubusw m5, [thrq+%2]
+ psubusw m5, [thrq]
psubusw m2, m6
paddw m5, [pw_2]
- movq m6, m2
+ mova m6, m2
paddw m2, m5
psubw m5, m6
- movq m6, m1
+ mova m6, m1
paddw m1, m7
- psubw m1, [thrq+2*16+%2]
+ psubw m1, [thrq+2*16]
psubw m6, m7
- movq m7, [thrq+6*16+%2]
+ mova m7, [thrq+6*16]
psraw m5, 2
- paddusw m1, [thrq+2*16+%2]
+ paddusw m1, [thrq+2*16]
psubw m6, m7
- paddw m1, [thrq+2*16+%2]
+ paddw m1, [thrq+2*16]
paddusw m6, m7
- psubusw m1, [thrq+2*16+%2]
+ psubusw m1, [thrq+2*16]
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
- movq m7, m1
+ mova m7, m1
psraw m2, 2
psubw m4, [srcq+DCTSIZE*6*2]
psubw m1, m6
psubw m0, [srcq+DCTSIZE*7*2]
paddw m6, m7
psraw m6, 2
- movq m7, m2
+ mova m7, m2
pmulhw m1, [pw_5A82]
paddw m2, m6
- movq [rsp], m2
+ mova [rsp], m2
psubw m7, m6
- movq m2, [srcq+DCTSIZE*2*2]
+ mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
- movq m6, m5
- movq [rsp+8*3], m7
+ mova m6, m5
+ mova [rsp+16*3], m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
- movq m7, m3
+ mova m7, m3
psubw m3, m4
psllw m7, 1
pmulhw m3, [pw_61F8]
psllw m4, 2
+ add srcq, 32
pmulhw m7, [pw_4546]
psllw m2, 1
pmulhw m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m2, [pw_5A82]
psubw m6, m1
paddw m7, m3
- movq [rsp+8], m5
+ mova [rsp+16], m5
paddw m4, m3
- movq m3, [thrq+3*16+%2]
- movq m1, m0
- movq [rsp+8*2], m6
+ mova m3, [thrq+3*16]
+ mova m1, m0
+ mova [rsp+16*2], m6
psubw m1, m2
paddw m0, m2
- movq m5, m1
- movq m2, [thrq+5*16+%2]
+ mova m5, m1
+ mova m2, [thrq+5*16]
psubw m1, m7
paddw m5, m7
psubw m1, m3
- movq m7, [thrq+16+%2]
+ mova m7, [thrq+16]
psubw m5, m2
- movq m6, m0
+ mova m6, m0
paddw m0, m4
paddusw m1, m3
psubw m6, m4
- movq m4, [thrq+7*16+%2]
+ mova m4, [thrq+7*16]
psubw m0, m7
psubw m6, m4
paddusw m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubusw m1, m3
psubusw m5, m2
psubusw m6, m4
- movq m4, m1
+ mova m4, m1
por m4, m5
paddusw m0, m7
por m4, m6
paddw m0, m7
packssdw m4, m4
psubusw m0, m7
- movd tmpd, m4
- or tmpd, tmpd
+%if ARCH_X86_64
+ movq tmpq, m4
+%else
+ packssdw m4, m4
+ movd tmpd, m4
+%endif
+ or tmpq, tmpq
jnz %1
- movq m4, [rsp]
+ mova m4, [rsp]
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
pmulhw m0, [pw_5A82]
paddw m5, m4
- movq m6, [rsp+8]
+ mova m6, [rsp+16]
psubw m2, m1
psubw m4, m3
- movq m7, [outq+DCTSIZE*1*2]
+ mova m7, [outq+DCTSIZE*1*2]
paddw m5, m3
psubw m1, m3
mova [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
- movq m3, [rsp+8*2]
+ mova m3, [rsp+16*2]
psubw m6, m1
- movq m4, [outq+DCTSIZE*2*2]
+ mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
- movq [outq], m5
+ mova [outq], m5
paddw m4, m3
- movq [outq+DCTSIZE*6*2], m6
+ mova [outq+DCTSIZE*6*2], m6
psubw m3, m0
- movq m5, [outq+DCTSIZE*5*2]
+ mova m5, [outq+DCTSIZE*5*2]
paddw m4, m0
- movq m6, [outq+DCTSIZE*3*2]
+ mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
- movq m0, [rsp+8*3]
- add srcq, 8+%3
- movq [outq+DCTSIZE*1*2], m7
+ mova m0, [rsp+16*3]
+ mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
- movq [outq+DCTSIZE*2*2], m4
+ mova [outq+DCTSIZE*2*2], m4
paddw m0, m2
- movq m7, [outq+DCTSIZE*4*2]
+ mova m7, [outq+DCTSIZE*4*2]
psubw m6, m2
- movq [outq+DCTSIZE*5*2], m5
+ mova [outq+DCTSIZE*5*2], m5
paddw m7, m0
- movq [outq+DCTSIZE*3*2], m6
- movq [outq+DCTSIZE*4*2], m7
- add outq, 8+%3
+ mova [outq+DCTSIZE*3*2], m6
+ mova [outq+DCTSIZE*4*2], m7
+ add outq, 32
%endmacro
-%macro COLUMN_IDCT 0-1 0
- movq m3, m5
+%macro COLUMN_IDCT 0
+ mova m3, m5
psubw m5, m1
paddw m3, m1
- movq m2, m0
+ mova m2, m0
psubw m0, m6
psllw m1, m5, 1
pmulhw m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m5, [pw_7642]
paddw m2, m6
pmulhw m0, [pw_4546]
- movq m7, m2
- movq m4, [rsp]
+ mova m7, m2
+ mova m4, [rsp]
psubw m2, m3
paddw m7, m3
pmulhw m2, [pw_5A82]
- movq m6, m4
+ mova m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
- movq m3, [rsp+8]
+ mova m3, [rsp+16]
paddw m4, m7
- movq [outq+DCTSIZE*7*2], m6
+ mova [outq+DCTSIZE*7*2], m6
paddw m1, m5
- movq [outq], m4
+ mova [outq], m4
psubw m1, m7
- movq m7, [rsp+8*2]
+ mova m7, [rsp+16*2]
psubw m0, m5
- movq m6, [rsp+8*3]
- movq m5, m3
+ mova m6, [rsp+16*3]
+ mova m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
psubw m2, m1
paddw m3, m1
- movq [outq+DCTSIZE*6*2], m5
- movq m4, m7
+ mova [outq+DCTSIZE*6*2], m5
+ mova m4, m7
paddw m7, [outq+DCTSIZE*2*2]
psubw m4, m2
paddw m4, [outq+DCTSIZE*5*2]
paddw m7, m2
- movq [outq+DCTSIZE*1*2], m3
+ mova [outq+DCTSIZE*1*2], m3
paddw m0, m2
- movq [outq+DCTSIZE*2*2], m7
- movq m1, m6
+ mova [outq+DCTSIZE*2*2], m7
+ mova m1, m6
paddw m6, [outq+DCTSIZE*4*2]
psubw m1, m0
paddw m1, [outq+DCTSIZE*3*2]
paddw m6, m0
- movq [outq+DCTSIZE*5*2], m4
- add srcq, 8+%1
- movq [outq+DCTSIZE*4*2], m6
- movq [outq+DCTSIZE*3*2], m1
- add outq, 8+%1
+ mova [outq+DCTSIZE*5*2], m4
+ mova [outq+DCTSIZE*4*2], m6
+ mova [outq+DCTSIZE*3*2], m1
+ add outq, 32
%endmacro
-INIT_MMX mmx
-;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
-cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
-.fdct1:
- COLUMN_FDCT .idct1
- jmp .fdct2
-
-.idct1:
- COLUMN_IDCT
-
-.fdct2:
- COLUMN_FDCT .idct2, 8, 16
+;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
+cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+.fdct:
+ COLUMN_FDCT .idct
sub cntd, 2
- jg .fdct1
+ jg .fdct
RET
-.idct2:
- COLUMN_IDCT 16
+.idct:
+ COLUMN_IDCT
sub cntd, 2
- jg .fdct1
+ jg .fdct
RET
+INIT_MMX mmx
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
add strideq, strideq
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index ee875547d2..c7a9b1799e 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int
cnt);
+void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t
line_size, int cnt);
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
+ s->column_fidct = ff_column_fidct_sse2;
}
}
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index de407739d9..ccb9cd1e7d 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -136,8 +136,8 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
- declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
- int16_t *output, int cnt);
+ declare_func(void, int16_t *thr_adr, int16_t *data,
+ int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);
commit 68b11cde8212b8ea0309ef6d11b01c782fa0b943
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 21:03:06 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100
tests/checkasm/vf_fspp: Add test for column_fidct
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 29b91f98d7..de407739d9 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -36,6 +36,12 @@
buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
} while (0)
+#define randomize_buffer_range(buf, min, max) \
+ do { \
+ for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+ buf[j] = min + rnd() % (max - min + 1); \
+ } while (0)
+
static void check_store_slice(void)
{
enum {
@@ -124,8 +130,41 @@ static void check_mul_thrmat(void)
}
}
+static void check_column_fidct(void)
+{
+ enum {
+ NB_BLOCKS = 8, ///< arbitrary
+ };
+ FSPPDSPContext fspp;
+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
+ int16_t *output, int cnt);
+
+ ff_fsppdsp_init(&fspp);
+
+ if (check_func(fspp.column_fidct, "column_fidct")) {
+ DECLARE_ALIGNED(16, int16_t, threshold)[64];
+ DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
+ DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
+ DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
+
+ randomize_buffer_range(threshold, 0, INT16_MAX);
+ randomize_buffer_range(src, -1284, 1284);
+ randomize_buffers(dst_new);
+ memcpy(dst_ref, dst_new, sizeof(dst_ref));
+
+ call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
+ call_new(threshold, src, dst_new, NB_BLOCKS * 8);
+
+ if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
+ fail();
+
+ bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
+ }
+}
+
void checkasm_check_vf_fspp(void)
{
check_store_slice();
check_mul_thrmat();
+ check_column_fidct();
}
commit 63493bf0e0909e701b64392be419f69491b8cbf1
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 19:39:35 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100
avfilter/x86/vf_fspp: Put shifts into constants
This avoids some shift instructions and also gives us more headroom
in the registers. In fact, I have proven to myself that everything
that is supposed to fit into 16 bits now actually does so.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 2f49945c13..f61efc99f8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -27,10 +27,13 @@ SECTION_RODATA
cextern fspp_dither
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4
@@ -211,12 +214,12 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubw m2, m6
paddw m7, m1
movq m6, [thrq+4*16+%2]
- psllw m7, 2
+ psllw m7, 1
psubw m5, [thrq+%2]
psubw m2, m6
paddusw m5, [thrq+%2]
paddusw m2, m6
- pmulhw m7, [pw_2D41]
+ pmulhw m7, [pw_5A82]
paddw m5, [thrq+%2]
paddw m2, m6
psubusw m5, [thrq+%2]
@@ -261,15 +264,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m4, m0
movq m7, m3
psubw m3, m4
- psllw m3, 2
- psllw m7, 2
- pmulhw m3, [pw_187E]
+ psllw m7, 1
+ pmulhw m3, [pw_61F8]
psllw m4, 2
- pmulhw m7, [pw_22A3]
- psllw m2, 2
+ pmulhw m7, [pw_4546]
+ psllw m2, 1
pmulhw m4, [pw_539F]
paddw m5, m1
- pmulhw m2, [pw_2D41]
+ pmulhw m2, [pw_5A82]
psubw m6, m1
paddw m7, m3
movq [rsp+8], m5
@@ -313,11 +315,10 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
jnz %1
movq m4, [rsp]
psraw m3, m0, 2
- psllw m0, 1
mova m5, [outq+DCTSIZE*0*2]
- pmulhw m1, m0, [pw_3B21]
- pmulhw m2, m0, [pw_22A3]
- pmulhw m0, [pw_2D41]
+ pmulhw m1, m0, [pw_7642]
+ pmulhw m2, m0, [pw_4546]
+ pmulhw m0, [pw_5A82]
paddw m5, m4
movq m6, [rsp+8]
psubw m2, m1
@@ -360,23 +361,20 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%macro COLUMN_IDCT 0-1 0
movq m3, m5
psubw m5, m1
- psllw m5, 1
paddw m3, m1
movq m2, m0
psubw m0, m6
- movq m1, m5
- psllw m0, 1
+ psllw m1, m5, 1
pmulhw m1, [pw_AC62]
paddw m5, m0
- pmulhw m5, [pw_3B21]
+ pmulhw m5, [pw_7642]
paddw m2, m6
- pmulhw m0, [pw_22A3]
+ pmulhw m0, [pw_4546]
movq m7, m2
movq m4, [rsp]
psubw m2, m3
- psllw m2, 1
paddw m7, m3
- pmulhw m2, [pw_2D41]
+ pmulhw m2, [pw_5A82]
movq m6, m4
psraw m7, 2
paddw m4, [outq]
commit 66af18d06a3faf9f8960ad6bd5a400701a0cdaab
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 18:44:49 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 12:18:11 2025 +0100
avfilter/x86/vf_fspp: Make ff_column_fidct_mmx() bitexact
It currently is not, because the shortcut mode rounds differently
from both the C code and the non-shortcut code.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 489e69f8ce..2f49945c13 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
-pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
-pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
-pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4: times 4 dw 4
pw_2: times 4 dw 2
@@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
or tmpd, tmpd
jnz %1
movq m4, [rsp]
- movq m1, m0
- pmulhw m0, [pw_3642]
- movq m2, m1
- movq m5, [outq+DCTSIZE*0*2]
- movq m3, m2
- pmulhw m1, [pw_2441]
+ psraw m3, m0, 2
+ psllw m0, 1
+ mova m5, [outq+DCTSIZE*0*2]
+ pmulhw m1, m0, [pw_3B21]
+ pmulhw m2, m0, [pw_22A3]
+ pmulhw m0, [pw_2D41]
paddw m5, m4
movq m6, [rsp+8]
- psraw m3, 2
- pmulhw m2, [pw_0CBB]
+ psubw m2, m1
psubw m4, m3
movq m7, [outq+DCTSIZE*1*2]
paddw m5, m3
- movq [outq+DCTSIZE*7*2], m4
+ psubw m1, m3
+ mova [outq+DCTSIZE*7*2], m4
+ psubw m0, m1
+ paddw m2, m0
+ mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
movq m3, [rsp+8*2]
- psubw m6, m0
+ psubw m6, m1
movq m4, [outq+DCTSIZE*2*2]
- paddw m7, m0
+ paddw m7, m1
movq [outq], m5
paddw m4, m3
movq [outq+DCTSIZE*6*2], m6
- psubw m3, m1
+ psubw m3, m0
movq m5, [outq+DCTSIZE*5*2]
- paddw m4, m1
+ paddw m4, m0
movq m6, [outq+DCTSIZE*3*2]
paddw m5, m3
movq m0, [rsp+8*3]
@@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movq [outq+DCTSIZE*1*2], m7
paddw m6, m0
movq [outq+DCTSIZE*2*2], m4
- psubw m0, m2
+ paddw m0, m2
movq m7, [outq+DCTSIZE*4*2]
- paddw m6, m2
+ psubw m6, m2
movq [outq+DCTSIZE*5*2], m5
paddw m7, m0
movq [outq+DCTSIZE*3*2], m6
commit 1049a5fba8f9437b94050105be8d32545675315e
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 12 14:21:09 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/vf_fsppdsp: Reduce discrepancies between C code and x86 asm
The x86 assembly uses the following pattern to zero all
the values with abs<threshold:
x -= threshold;
x satu+= threshold (unsigned saturated addition)
x += threshold
x satu-= threshold (unsigned saturated subtraction)
The reference C code meanwhile zeroed everything
with abs <= threshold. This commit makes the C code behave
like the x86 assembly to reduce discrepancies between the two.
An alternative would be to require SSSE3, so that
one can use pabsw, pcmpgtw for abs>threshold, followed by
a pand with the original data. Or one could modify the thresholds
to make both equal.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 583571bf94..e530bcd06b 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -34,7 +34,7 @@
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
#define THRESHOLD(r,x,t) \
- if(((unsigned)((x) + t)) > t * 2) r = (x); \
+ if (((unsigned)((x) + t)) >= t * 2) r = (x); \
else r = 0;
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
commit d19050a1ae90b4ad8e9e2dadc5c8ca0c39301d69
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 23:03:23 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/vf_fsppdsp: Use restrict
It is possible because the requirements are fulfilled;
it is also beneficial both performance-wise and code-size-wise.
For GCC 14 (with -O3), this reduced codesize by 26750B
here; for Clang 20, it was 432B.
Old benchmarks:
mul_thrmat_c: 4.3 ( 1.00x)
mul_thrmat_sse2: 4.3 ( 1.00x)
store_slice_c: 2810.8 ( 1.00x)
store_slice_sse2: 542.5 ( 5.18x)
store_slice2_c: 3817.0 ( 1.00x)
store_slice2_sse2: 410.4 ( 9.30x)
New benchmarks:
mul_thrmat_c: 4.3 ( 1.00x)
mul_thrmat_sse2: 4.3 ( 1.00x)
store_slice_c: 1510.1 ( 1.00x)
store_slice_sse2: 545.2 ( 2.77x)
store_slice2_c: 1763.5 ( 1.00x)
store_slice2_sse2: 408.3 ( 4.32x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index f3f7c87174..583571bf94 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
};
//This func reads from 1 slice, 1 and clears 0 & 1
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
}
}
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr,
int q)
{
for (int a = 0; a < 64; a++)
thr_adr[a] = q * thr_adr_noq[a];
}
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int
cnt)
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+ int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt
}
}
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt)
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+ ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr,
ptrdiff_t output_str
}
}
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size,
int cnt)
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+ ptrdiff_t line_size, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index b440809f02..66030da4b1 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,40 +31,43 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
- void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
+ void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align
16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
- void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
+ void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align
16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
- void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
- int16_t *thr_adr /* align 16 */, int q);
+ void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+ int16_t *restrict thr_adr /* align 16 */, int q);
- void (*column_fidct)(int16_t *thr_adr, int16_t *data,
- int16_t *output, int cnt);
+ void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+ int16_t *restrict output, int cnt);
- void (*row_idct)(int16_t *workspace, int16_t *output_adr,
+ void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
- void (*row_fdct)(int16_t *data, const uint8_t *pixels,
+ void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
} FSPPDSPContext;
FF_VISIBILITY_PUSH_HIDDEN
extern const uint8_t ff_fspp_dither[8][8];
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int
cnt);
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size,
int cnt);
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr,
int q);
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+ int16_t *restrict output, int cnt);
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+ ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+ ptrdiff_t line_size, int cnt);
void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
FF_VISIBILITY_POP_HIDDEN
commit ff85a20b7db4d3226ada8533b181989944f30e75
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 22:06:34 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/x86/vf_fspp: Port store_slice to SSE2
Old benchmarks:
store_slice_c: 2798.3 ( 1.00x)
store_slice_mmx: 950.2 ( 2.94x)
store_slice2_c: 3811.7 ( 1.00x)
store_slice2_mmx: 682.3 ( 5.59x)
New benchmarks:
store_slice_c: 2797.2 ( 1.00x)
store_slice_sse2: 543.5 ( 5.15x)
store_slice2_c: 3817.0 ( 1.00x)
store_slice2_sse2: 408.2 ( 9.35x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index e87fa6861c..b440809f02 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,11 +31,11 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
- void (*store_slice)(uint8_t *dst, int16_t *src,
+ void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
- void (*store_slice2)(uint8_t *dst, int16_t *src,
+ void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c9408978d8..489e69f8ce 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -43,15 +43,15 @@ SECTION .text
%define DCTSIZE 8
-INIT_MMX mmx
+INIT_XMM sse2
-;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-; ptrdiff_t dst_stride, ptrdiff_t src_stride,
-; ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
+;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
%if ARCH_X86_64
-cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width,
dither_height, dither, tmp, tmp2
+cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width,
dither_height, dither, tmp, tmp2
%else
-cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov widthq, r4m
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height,
dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
- movd m5, ditherd ; log2_scale
+ movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
- pxor m7, m7
+ pxor m1, m1
.loop_height:
movq m3, [ditherq]
- movq m4, m3
- punpcklbw m3, m7
- punpckhbw m4, m7
+ punpcklbw m3, m1
mov tmp2q, widthq
- psraw m3, m5
- psraw m4, m5
+ psraw m3, m4
.loop_width:
- movq [srcq+tmpq], m7
- movq m0, [srcq]
- movq m1, [srcq+8]
- movq [srcq+tmpq+8], m7
+ mova m0, [srcq]
+ mova [srcq+tmpq], m1
paddw m0, m3
- paddw m1, m4
- movq [srcq], m7
+ mova [srcq], m1
psraw m0, m2
- psraw m1, m2
- movq [srcq+8], m7
- packuswb m0, m1
+ packuswb m0, m0
add srcq, 16
movq [dstq], m0
add dstq, 8
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
jl .loop_height
RET
-;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
-; ptrdiff_t dst_stride, ptrdiff_t src_stride,
-; ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
+;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
%if ARCH_X86_64
-cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width,
dither_height, dither, tmp, tmp2
+cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width,
dither_height, dither, tmp, tmp2
%else
-cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp,
tmp2
+cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp,
tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov dstq, dstm
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
- movd m5, ditherd ; log2_scale
+ movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
- pxor m7, m7
+ pxor m1, m1
.loop_height:
movq m3, [ditherq]
- movq m4, m3
- punpcklbw m3, m7
- punpckhbw m4, m7
+ punpcklbw m3, m1
mov tmp2q,widthq
- psraw m3, m5
- psraw m4, m5
+ psraw m3, m4
.loop_width:
- movq m0, [srcq]
- movq m1, [srcq+8]
+ mova m0, [srcq]
paddw m0, m3
paddw m0, [srcq+tmpq]
- paddw m1, m4
- movq m6, [srcq+tmpq+8]
- movq [srcq+tmpq], m7
+ mova [srcq+tmpq], m1
psraw m0, m2
- paddw m1, m6
- movq [srcq+tmpq+8], m7
- psraw m1, m2
- packuswb m0, m1
+ packuswb m0, m0
movq [dstq], m0
add srcq, 16
add dstq, 8
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
RET
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-INIT_XMM sse2
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movd m4, qd
mova m0, [thrnq]
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 9f6095ce24..ee875547d2 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -23,12 +23,12 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_fsppdsp.h"
-void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
-void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
+void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int
cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- s->store_slice = ff_store_slice_mmx;
- s->store_slice2 = ff_store_slice2_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ s->store_slice = ff_store_slice_sse2;
+ s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
}
}
commit 570f8fc6c9850edf6c05d58dea0629f162199f20
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 21:57:45 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
tests/checkasm/vf_fspp: Test store_slice
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 117e1c670e..29b91f98d7 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -16,8 +16,12 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <stddef.h>
+#include <stdint.h>
+
#include "checkasm.h"
#include "libavfilter/vf_fsppdsp.h"
+#include "libavcodec/mathops.h"
#include "libavutil/mem_internal.h"
#define randomize_buffers(buf) \
@@ -26,6 +30,78 @@
buf[j] = rnd(); \
} while (0)
+#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\
+ do { \
+ for (size_t j = 0; j < nb_elems; ++j) \
+ buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
+ } while (0)
+
+static void check_store_slice(void)
+{
+ enum {
+ MAX_WIDTH = 256,
+ /// in elements, not in bytes; 32 is arbitrary
+ MAX_STRIDE = MAX_WIDTH + 32,
+ MAX_HEIGHT = 8,
+ };
+ FSPPDSPContext fspp;
+ ff_fsppdsp_init(&fspp);
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+
+ for (int i = 0; i < 2; ++i) {
+ if (check_func(i ? fspp.store_slice2 : fspp.store_slice,
"store_slice%s", i ? "2" : "")) {
+ // store slice resets the row eight lines above the current one
+ DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 +
MAX_HEIGHT - 1) + MAX_WIDTH];
+ DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 +
MAX_HEIGHT - 1) + MAX_WIDTH];
+ // store_slice2 resets the row 16 lines below the current one
+ DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 +
MAX_HEIGHT - 1) + MAX_WIDTH];
+ DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 +
MAX_HEIGHT - 1) + MAX_WIDTH];
+ uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH],
dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
+ uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+ int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
+ ptrdiff_t width = 1 + rnd() % MAX_WIDTH;
+ ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE -
MAX_WIDTH), 8);
+ ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE -
MAX_WIDTH), 8);
+ ptrdiff_t height = 1 + rnd() % 8;
+ size_t nb_elems;
+
+ if (i) {
+ src_ref = src_ref2;
+ src_new = src_new2;
+ or_src_ref = src_ref2;
+ or_src_new = src_new2;
+ nb_elems = FF_ARRAY_ELEMS(src_ref2);
+ } else {
+ src_ref = src_ref1 + 8 * src_stride;
+ src_new = src_new1 + 8 * src_stride;
+ or_src_ref = src_ref1;
+ or_src_new = src_new1;
+ nb_elems = FF_ARRAY_ELEMS(src_ref1);
+ }
+ if (rnd() & 1) {
+ dst_ref += dst_stride * (height - 1);
+ dst_new += dst_stride * (height - 1);
+ dst_stride *= -1;
+ }
+ randomize_buffers(dstbuf_new);
+ memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
+ randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
+
+ ptrdiff_t log2_scale = rnd() & 1;
+ call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height,
log2_scale);
+ call_new(dst_new, src_new, dst_stride, src_stride, width, height,
log2_scale);
+ if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
+ memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
+ fail();
+ // don't use random parameters for benchmarks
+ src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
+ bench_new(dstbuf_new, src_ref,
+ MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
+ }
+ }
+}
static void check_mul_thrmat(void)
{
@@ -50,5 +126,6 @@ static void check_mul_thrmat(void)
void checkasm_check_vf_fspp(void)
{
+ check_store_slice();
check_mul_thrmat();
}
commit e042f17e9947779e3b1b981218370472940ca3c6
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 10 12:54:31 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/vf_fsppdsp: Use standard clamping
This is obviously what is intended and what the MMX code does;
yet I cannot rule out that it changes the output for some inputs:
I have observed individual src values which would lead to temp
values just above 512 if they came in pairs (i.e. if both inputs
were simultaneously huge).
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index b84d7b57bb..f3f7c87174 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -24,6 +24,7 @@
#include "vf_fsppdsp.h"
+#include "libavutil/common.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
@@ -70,7 +71,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
#define STORE(pos)
\
temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);
\
src[x + pos] = src[x + pos - 8 * src_stride] = 0;
\
- if (temp & 0x100) temp = ~(temp >> 31);
\
+ temp = av_clip_uint8(temp);
\
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
@@ -99,7 +100,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
#define STORE2(pos)
\
temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >>
log2_scale)) >> (6 - log2_scale); \
src[x + pos + 16 * src_stride] = 0;
\
- if (temp & 0x100) temp = ~(temp >> 31);
\
+ temp = av_clip_uint8(temp);
\
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
commit 52ba2ac7bd48d09d1f8527376970e2b0e8ee5068
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 19:10:30 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/x86/vf_fspp: Port mul_thrmat to SSE2
This fixes an ABI violation, as mul_thrmat did not issue emms.
It seems that this ABI violation could reach the user, namely
if ff_get_video_buffer() fails. Notice that ff_get_video_buffer()
itself could fail because of this, namely if the allocator uses
floating point registers.
On x64 (where GCC already used SSE2 in the C version):
mul_thrmat_c: 4.4 ( 1.00x)
mul_thrmat_mmx: 8.6 ( 0.52x)
mul_thrmat_sse2: 4.4 ( 1.00x)
On 32bit (where SSE2 is not known to be available):
mul_thrmat_c: 56.0 ( 1.00x)
mul_thrmat_sse2: 6.0 ( 9.40x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9371c63e77..fa562cbd45 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -54,8 +54,6 @@
typedef struct FSPPContext {
const struct AVClass *class;
- uint64_t threshold_mtx_noq[8 * 2];
- uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later
SSE2) versions
int log2_count;
int strength;
@@ -72,6 +70,9 @@ typedef struct FSPPContext {
int use_bframe_qp;
FSPPDSPContext dsp;
+
+ DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
+ DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
} FSPPContext;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 0dbd628abf..e87fa6861c 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,7 +39,8 @@ typedef struct FSPPDSPContext {
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
- void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+ void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
+ int16_t *thr_adr /* align 16 */, int q);
void (*column_fidct)(int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 0ea6216193..c9408978d8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
jl .loop_height
RET
-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
- movd m7, qd
- movq m0, [thrnq]
- punpcklwd m7, m7
- movq m1, [thrnq+8]
- punpckldq m7, m7
- pmullw m0, m7
- movq m2, [thrnq+8*2]
- pmullw m1, m7
- movq m3, [thrnq+8*3]
- pmullw m2, m7
- movq [thrq], m0
- movq m4, [thrnq+8*4]
- pmullw m3, m7
- movq [thrq+8], m1
- movq m5, [thrnq+8*5]
- pmullw m4, m7
- movq [thrq+8*2], m2
- movq m6, [thrnq+8*6]
- pmullw m5, m7
- movq [thrq+8*3], m3
- movq m0, [thrnq+8*7]
- pmullw m6, m7
- movq [thrq+8*4], m4
- movq m1, [thrnq+8*7+8]
- pmullw m0, m7
- movq [thrq+8*5], m5
- movq m2, [thrnq+8*7+8*2]
- pmullw m1, m7
- movq [thrq+8*6], m6
- movq m3, [thrnq+8*7+8*3]
- pmullw m2, m7
- movq [thrq+8*7], m0
- movq m4, [thrnq+8*7+8*4]
- pmullw m3, m7
- movq [thrq+8*7+8], m1
- movq m5, [thrnq+8*7+8*5]
- pmullw m4, m7
- movq [thrq+8*7+8*2], m2
- movq m6, [thrnq+8*7+8*6]
- pmullw m5, m7
- movq [thrq+8*7+8*3], m3
- movq m0, [thrnq+14*8]
- pmullw m6, m7
- movq [thrq+8*7+8*4], m4
- movq m1, [thrnq+14*8+8]
- pmullw m0, m7
- movq [thrq+8*7+8*5], m5
- pmullw m1, m7
- movq [thrq+8*7+8*6], m6
- movq [thrq+14*8], m0
- movq [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+INIT_XMM sse2
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+ movd m4, qd
+ mova m0, [thrnq]
+ punpcklwd m4, m4
+ mova m1, [thrnq+16]
+ pshufd m4, m4, 0
+ pmullw m0, m4
+ mova m2, [thrnq+16*2]
+ pmullw m1, m4
+ mova m3, [thrnq+16*3]
+ pmullw m2, m4
+ mova [thrq], m0
+ mova m0, [thrnq+16*4]
+ pmullw m3, m4
+ mova [thrq+16], m1
+ mova m1, [thrnq+16*5]
+ pmullw m0, m4
+ mova [thrq+16*2], m2
+ mova m2, [thrnq+16*6]
+ pmullw m1, m4
+ mova [thrq+16*3], m3
+ mova m3, [thrnq+16*7]
+ pmullw m2, m4
+ mova [thrq+16*4], m0
+ pmullw m3, m4
+ mova [thrq+16*5], m1
+ mova [thrq+16*6], m2
+ mova [thrq+16*7], m3
RET
%macro COLUMN_FDCT 1-3 0, 0
@@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
add outq, 8+%1
%endmacro
+INIT_MMX mmx
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt);
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 2aadb50967..9f6095ce24 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int
cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t
line_size, int cnt);
@@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
if (EXTERNAL_MMX(cpu_flags)) {
s->store_slice = ff_store_slice_mmx;
s->store_slice2 = ff_store_slice2_mmx;
- s->mul_thrmat = ff_mul_thrmat_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ s->mul_thrmat = ff_mul_thrmat_sse2;
+ }
}
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index a84ae8d5af..117e1c670e 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -18,6 +18,7 @@
#include "checkasm.h"
#include "libavfilter/vf_fsppdsp.h"
+#include "libavutil/mem_internal.h"
#define randomize_buffers(buf) \
do { \
@@ -29,10 +30,11 @@
static void check_mul_thrmat(void)
{
FSPPDSPContext fspp;
- int16_t src[64];
- int16_t dst_ref[64], dst_new[64];
+ DECLARE_ALIGNED(16, int16_t, src)[64];
+ DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+ DECLARE_ALIGNED(16, int16_t, dst_new)[64];
const int q = (uint8_t)rnd();
- declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t
*thr_adr, int q);
+ declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
ff_fsppdsp_init(&fspp);
commit 70eb8a76a91e9c9fe3a6c0b4f1c2ff28f5447086
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 18:50:48 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
tests/checkasm: Add vf_fspp mul_thrmat test
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..6636bc7774 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
+AVFILTEROBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o
AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..20d8f19757 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -297,6 +297,9 @@ static const struct {
#if CONFIG_EQ_FILTER
{ "vf_eq", checkasm_check_vf_eq },
#endif
+ #if CONFIG_FSPP_FILTER
+ { "vf_fspp", checkasm_check_vf_fspp },
+ #endif
#if CONFIG_GBLUR_FILTER
{ "vf_gblur", checkasm_check_vf_gblur },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..45cd23cac4 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
void checkasm_check_vc1dsp(void);
void checkasm_check_vf_bwdif(void);
void checkasm_check_vf_eq(void);
+void checkasm_check_vf_fspp(void);
void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
new file mode 100644
index 0000000000..a84ae8d5af
--- /dev/null
+++ b/tests/checkasm/vf_fspp.c
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "checkasm.h"
+#include "libavfilter/vf_fsppdsp.h"
+
+#define randomize_buffers(buf) \
+ do { \
+ for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+ buf[j] = rnd(); \
+ } while (0)
+
+
+static void check_mul_thrmat(void)
+{
+ FSPPDSPContext fspp;
+ int16_t src[64];
+ int16_t dst_ref[64], dst_new[64];
+ const int q = (uint8_t)rnd();
+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t
*thr_adr, int q);
+
+ ff_fsppdsp_init(&fspp);
+
+ if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
+ randomize_buffers(src);
+ call_ref(src, dst_ref, q);
+ call_new(src, dst_new, q);
+ if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
+ fail();
+ bench_new(src, dst_new, q);
+ }
+}
+
+void checkasm_check_vf_fspp(void)
+{
+ check_mul_thrmat();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index ca1cd0dea3..2be880c8db 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp
\
fate-checkasm-vf_colordetect \
fate-checkasm-vf_colorspace \
fate-checkasm-vf_eq \
+ fate-checkasm-vf_fspp \
fate-checkasm-vf_gblur \
fate-checkasm-vf_hflip \
fate-checkasm-vf_nlmeans \
commit 9f4d5d818d709788ab6b199a634a95a2bfcd4898
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:27:16 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/x86/vf_fspp: Don't duplicate dither table
Reuse the one from vf_fsppdsp.c; also don't overalign said table too
much.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index d2d04463b4..b84d7b57bb 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -51,7 +51,7 @@ enum {
FIX_1_082392200 = FIX(1.082392200, 13),
};
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
@@ -74,7 +74,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
+ const uint8_t *d = ff_fspp_dither[y];
for (int x = 0; x < width; x += 8) {
int temp;
STORE(0);
@@ -103,7 +103,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
+ const uint8_t *d = ff_fspp_dither[y];
for (int x = 0; x < width; x += 8) {
int temp;
STORE2(0);
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index c441b75094..0dbd628abf 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -52,6 +52,8 @@ typedef struct FSPPDSPContext {
} FSPPDSPContext;
FF_VISIBILITY_PUSH_HIDDEN
+extern const uint8_t ff_fspp_dither[8][8];
+
void ff_store_slice_c(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c7f8f64f1b..0ea6216193 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -25,10 +25,7 @@
SECTION_RODATA
-pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35,
19, 47, 31, \
- 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43,
27, 39, 23, \
- 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33,
17, 45, 29, \
- 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41,
25, 37, 21
+cextern fspp_dither
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
@@ -73,7 +70,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height,
dither, tmp, tmp2
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
- lea ditherq, [pb_dither]
+ lea ditherq, [fspp_dither]
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
@@ -139,7 +136,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width,
dither_height, dither, tmp, tmp2
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
- lea ditherq, [pb_dither]
+ lea ditherq, [fspp_dither]
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
commit 1699de09551da5efe413637fcb4c90bcaea31b4c
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:22:21 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/vf_fsppdsp: Use enum for constants
It means that the compiler does not have to optimize the static const
objects away.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index ab31c77203..d2d04463b4 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -29,7 +29,7 @@
#define DCTSIZE 8
-#define FIX(x,s) ((x) * (1 << s) + 0.5)
+#define FIX(x,s) (int)((x) * (1 << s) + 0.5)
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
#define THRESHOLD(r,x,t) \
@@ -38,15 +38,18 @@
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
-static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
-static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
-static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
+
+enum {
+ FIX_0_382683433 = FIX(0.382683433, 14),
+ FIX_0_541196100 = FIX(0.541196100, 14),
+ FIX_0_707106781 = FIX(M_SQRT1_2 , 14),
+ FIX_1_306562965 = FIX(1.306562965, 14),
+ FIX_1_414213562_A = FIX(M_SQRT2 , 14),
+ FIX_1_847759065 = FIX(1.847759065, 13),
+ FIX_2_613125930 = FIX(-2.613125930, 13),
+ FIX_1_414213562 = FIX(M_SQRT2 , 13),
+ FIX_1_082392200 = FIX(1.082392200, 13),
+};
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
commit 9b34088c4dfec112170a0a0102acb3be1d77d240
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 9 17:06:46 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Mon Nov 17 11:28:04 2025 +0100
avfilter/vf_fspp: Add DSPCtx, move DSP functions to file of their own
This is in preparation for adding checkasm tests; without it,
checkasm would pull all of libavfilter in.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 69d74183b2..d56a458e45 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER) +=
vf_framestep.o
OBJS-$(CONFIG_FREEZEDETECT_FILTER) += vf_freezedetect.o
OBJS-$(CONFIG_FREEZEFRAMES_FILTER) += vf_freezeframes.o
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
-OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o
+OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o vf_fsppdsp.o
qp_table.o
OBJS-$(CONFIG_FSYNC_FILTER) += vf_fsync.o
OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
OBJS-$(CONFIG_GBLUR_VULKAN_FILTER) += vf_gblur_vulkan.o vulkan.o
vulkan_filter.o
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 6b4a715367..9371c63e77 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -41,12 +41,40 @@
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"
+#include "avfilter.h"
#include "filters.h"
#include "qp_table.h"
-#include "vf_fspp.h"
+#include "vf_fsppdsp.h"
#include "video.h"
+#define BLOCKSZ 12
+#define MAX_LEVEL 5
+
+typedef struct FSPPContext {
+ const struct AVClass *class;
+ uint64_t threshold_mtx_noq[8 * 2];
+ uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later
SSE2) versions
+
+ int log2_count;
+ int strength;
+ int hsub;
+ int vsub;
+ int temp_stride;
+ int qp;
+ enum AVVideoEncParamsType qscale_type;
+ int prev_q;
+ uint8_t *src;
+ int16_t *temp;
+ int8_t *non_b_qp_table;
+ int non_b_qp_stride;
+ int use_bframe_qp;
+
+ FSPPDSPContext dsp;
+} FSPPContext;
+
+
#define OFFSET(x) offsetof(FSPPContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
static const AVOption fspp_options[] = {
@@ -59,17 +87,6 @@ static const AVOption fspp_options[] = {
AVFILTER_DEFINE_CLASS(fspp);
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
- { 0, 48, 12, 60, 3, 51, 15, 63, },
- { 32, 16, 44, 28, 35, 19, 47, 31, },
- { 8, 56, 4, 52, 11, 59, 7, 55, },
- { 40, 24, 36, 20, 43, 27, 39, 23, },
- { 2, 50, 14, 62, 1, 49, 13, 61, },
- { 34, 18, 46, 30, 33, 17, 45, 29, },
- { 10, 58, 6, 54, 9, 57, 5, 53, },
- { 42, 26, 38, 22, 41, 25, 37, 21, },
-};
-
static const short custom_threshold[64] = {
// values (296) can't be too high
// -it causes too big quant dependence
@@ -84,73 +101,6 @@ static const short custom_threshold[64] = {
20, 27, 26, 23, 20, 15, 11, 5
};
-//This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_c(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
-{
- int y, x;
-#define STORE(pos)
\
- temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);
\
- src[x + pos] = src[x + pos - 8 * src_stride] = 0;
\
- if (temp & 0x100) temp = ~(temp >> 31);
\
- dst[x + pos] = temp;
-
- for (y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
- for (x = 0; x < width; x += 8) {
- int temp;
- STORE(0);
- STORE(1);
- STORE(2);
- STORE(3);
- STORE(4);
- STORE(5);
- STORE(6);
- STORE(7);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-//This func reads from 2 slices, 0 & 2 and clears 2-nd
-static void store_slice2_c(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t
log2_scale)
-{
- int y, x;
-#define STORE2(pos)
\
- temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >>
log2_scale)) >> (6 - log2_scale); \
- src[x + pos + 16 * src_stride] = 0;
\
- if (temp & 0x100) temp = ~(temp >> 31);
\
- dst[x + pos] = temp;
-
- for (y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
- for (x = 0; x < width; x += 8) {
- int temp;
- STORE2(0);
- STORE2(1);
- STORE2(2);
- STORE2(3);
- STORE2(4);
- STORE2(5);
- STORE2(6);
- STORE2(7);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
-{
- int a;
- for (a = 0; a < 64; a++)
- thr_adr[a] = q * thr_adr_noq[a];
-}
-
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
int dst_stride, int src_stride,
int width, int height,
@@ -197,13 +147,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
if (qy < 0) qy = 0;
qy = (qy >> qpsv) * qp_stride;
- p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
+ p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ -
1)) {
- p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 -
(y&1), stride, 2 * (BLOCKSZ - 1));
+ p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 -
(y&1), stride, 2 * (BLOCKSZ - 1));
if (p->qp)
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 *
8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block +
0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
else
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
t = x + x0 - 2; //correct
t=x+x0-2-(y&1), but its the same
@@ -213,288 +163,42 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t
*src,
t = qp_store[qy + (t >> qpsh)];
t = ff_norm_qscale(t, p->qscale_type);
- if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t
*)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block +
x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+ if (t != p->prev_q) p->prev_q = t,
p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t
*)(&p->threshold_mtx[0]), t);
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]),
block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
}
- p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 -
(y & 1), stride, 2 * (BLOCKSZ - 1));
+ p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 +
2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 *
sizeof(int16_t)); //cycling
memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 *
sizeof(int16_t));
}
es = width + 8 - x0; // 8, ...
if (es > 8)
- p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y &
1), stride, (es - 4) >> 2);
+ p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 -
(y & 1), stride, (es - 4) >> 2);
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3,
es&(~1));
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3,
es&(~1));
if (es > 3)
- p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 -
(y & 1), stride, es >> 2);
+ p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 +
2 - (y & 1), stride, es >> 2);
if (!(y1 & 7) && y1) {
if (y1 & 8)
- p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 *
stride,
- dst_stride, stride, width, 8, 5 -
p->log2_count);
+ p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 +
8 * stride,
+ dst_stride, stride, width, 8, 5 -
p->log2_count);
else
- p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 *
stride,
- dst_stride, stride, width, 8, 5 -
p->log2_count);
+ p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 +
0 * stride,
+ dst_stride, stride, width, 8, 5 -
p->log2_count);
}
}
if (y & 7) { // height % 8 != 0
if (y & 8)
- p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8
* stride,
- dst_stride, stride, width, y&7, 5 - p->log2_count);
+ p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8
+ 8 * stride,
+ dst_stride, stride, width, y&7, 5 -
p->log2_count);
else
- p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0
* stride,
+ p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8
+ 0 * stride,
dst_stride, stride, width, y&7, 5 - p->log2_count);
}
}
-static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output,
int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
- int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
-
- int16_t *dataptr;
- int16_t *wsptr;
- int16_t *threshold;
- int ctr;
-
- dataptr = data;
- wsptr = output;
-
- for (; cnt > 0; cnt -= 2) { //start positions
- threshold = (int16_t *)thr_adr;//threshold_mtx
- for (ctr = DCTSIZE; ctr > 0; ctr--) {
- // Process columns from input, add to output.
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
- tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
-
- tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
- tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
-
- tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
- tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
-
- tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
- tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
-
- // Even part of FDCT
-
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- d0 = tmp10 + tmp11;
- d4 = tmp10 - tmp11;
-
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
- d2 = tmp13 + z1;
- d6 = tmp13 - z1;
-
- // Even part of IDCT
-
- THRESHOLD(tmp0, d0, threshold[0 * 8]);
- THRESHOLD(tmp1, d2, threshold[2 * 8]);
- THRESHOLD(tmp2, d4, threshold[4 * 8]);
- THRESHOLD(tmp3, d6, threshold[6 * 8]);
- tmp0 += 2;
- tmp10 = (tmp0 + tmp2) >> 2;
- tmp11 = (tmp0 - tmp2) >> 2;
-
- tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
- tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13;
//<<2
-
- tmp0 = tmp10 + tmp13; //->temps
- tmp3 = tmp10 - tmp13; //->temps
- tmp1 = tmp11 + tmp12; //->temps
- tmp2 = tmp11 - tmp12; //->temps
-
- // Odd part of FDCT
-
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- d5 = z13 + z2;
- d3 = z13 - z2;
- d1 = z11 + z4;
- d7 = z11 - z4;
-
- // Odd part of IDCT
-
- THRESHOLD(tmp4, d1, threshold[1 * 8]);
- THRESHOLD(tmp5, d3, threshold[3 * 8]);
- THRESHOLD(tmp6, d5, threshold[5 * 8]);
- THRESHOLD(tmp7, d7, threshold[7 * 8]);
-
- //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
- z13 = tmp6 + tmp5;
- z10 = (tmp6 - tmp5) << 1;
- z11 = tmp4 + tmp7;
- z12 = (tmp4 - tmp7) << 1;
-
- tmp7 = (z11 + z13) >> 2; //+2 !
- tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // -
!!
-
- tmp6 = tmp12 - tmp7;
- tmp5 = tmp11 - tmp6;
- tmp4 = tmp10 + tmp5;
-
- wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
- wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
- wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
- wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
- wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
- wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
- wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
- wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
- //
- dataptr++; //next column
- wsptr++;
- threshold++;
- }
- dataptr += 8; //skip each second start pos
- wsptr += 8;
- }
-}
-
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t
output_stride, int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z5, z10, z11, z12, z13;
- int16_t *outptr;
- int16_t *wsptr;
-
- cnt *= 4;
- wsptr = workspace;
- outptr = output_adr;
- for (; cnt > 0; cnt--) {
- // Even part
- //Simd version reads 4x4 block and transposes it
- tmp10 = wsptr[2] + wsptr[3];
- tmp11 = wsptr[2] - wsptr[3];
-
- tmp13 = wsptr[0] + wsptr[1];
- tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) -
tmp13;//this shift order to avoid overflow
-
- tmp0 = tmp10 + tmp13; //->temps
- tmp3 = tmp10 - tmp13; //->temps
- tmp1 = tmp11 + tmp12;
- tmp2 = tmp11 - tmp12;
-
- // Odd part
- //Also transpose, with previous:
- // ---- ---- ||||
- // ---- ---- idct ||||
- // ---- ---- ---> ||||
- // ---- ---- ||||
- z13 = wsptr[4] + wsptr[5];
- z10 = wsptr[4] - wsptr[5];
- z11 = wsptr[6] + wsptr[7];
- z12 = wsptr[6] - wsptr[7];
-
- tmp7 = z11 + z13;
- tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
-
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
-
- tmp6 = (tmp12 << 3) - tmp7;
- tmp5 = (tmp11 << 3) - tmp6;
- tmp4 = (tmp10 << 3) + tmp5;
-
- // Final output stage: descale and write column
- outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
- outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
- outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
- outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
- outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
- outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
- outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
- outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
- outptr++;
-
- wsptr += DCTSIZE; // advance pointer to next row
- }
-}
-
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t
line_size, int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z1, z2, z3, z4, z5, z11, z13;
- int16_t *dataptr;
-
- cnt *= 4;
- // Pass 1: process rows.
-
- dataptr = data;
- for (; cnt > 0; cnt--) {
- tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
- tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
- tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
- tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
- tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
- tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
- tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
- tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
-
- // Even part
-
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
- //Even columns are written first, this leads to different order of
columns
- //in column_fidct(), but they are processed independently, so all ok.
- //Later in the row_idct() columns are read in the same order.
- dataptr[2] = tmp10 + tmp11;
- dataptr[3] = tmp10 - tmp11;
-
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
- dataptr[0] = tmp13 + z1;
- dataptr[1] = tmp13 - z1;
-
- // Odd part
-
- tmp10 = (tmp4 + tmp5) << 2;
- tmp11 = (tmp5 + tmp6) << 2;
- tmp12 = (tmp6 + tmp7) << 2;
-
- z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- dataptr[4] = z13 + z2;
- dataptr[5] = z13 - z2;
- dataptr[6] = z11 + z4;
- dataptr[7] = z11 - z4;
-
- pixels++; // advance pointer to next column
- dataptr += DCTSIZE;
- }
-}
-
static const enum AVPixelFormat pix_fmts[] = {
AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P,
AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV411P,
@@ -522,16 +226,7 @@ static int config_input(AVFilterLink *inlink)
if (!fspp->temp || !fspp->src)
return AVERROR(ENOMEM);
- fspp->store_slice = store_slice_c;
- fspp->store_slice2 = store_slice2_c;
- fspp->mul_thrmat = mul_thrmat_c;
- fspp->column_fidct = column_fidct_c;
- fspp->row_idct = row_idct_c;
- fspp->row_fdct = row_fdct_c;
-
-#if ARCH_X86
- ff_fspp_init_x86(fspp);
-#endif
+ ff_fsppdsp_init(&fspp->dsp);
return 0;
}
@@ -567,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
}
if (fspp->qp)
- fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
+ fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
/* if we are not in a constant user quantizer mode and we don't want to use
* the quantizers from the B-frames (B-frames often have a higher QP), we
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
new file mode 100644
index 0000000000..ab31c77203
--- /dev/null
+++ b/libavfilter/vf_fsppdsp.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <[email protected]>
+ * Copyright (C) 2005 Nikolaj Poroshin <[email protected]>
+ * Copyright (c) 2014 Arwa Arif <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+
+#include "vf_fsppdsp.h"
+
+#include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
+
+#define DCTSIZE 8
+
+#define FIX(x,s) ((x) * (1 << s) + 0.5)
+
+#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
+#define THRESHOLD(r,x,t) \
+ if(((unsigned)((x) + t)) > t * 2) r = (x); \
+ else r = 0;
+#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
+
+typedef int32_t int_simd16_t;
+static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
+static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
+static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
+static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
+static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
+static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
+static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
+static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
+static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
+
+DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+ { 0, 48, 12, 60, 3, 51, 15, 63, },
+ { 32, 16, 44, 28, 35, 19, 47, 31, },
+ { 8, 56, 4, 52, 11, 59, 7, 55, },
+ { 40, 24, 36, 20, 43, 27, 39, 23, },
+ { 2, 50, 14, 62, 1, 49, 13, 61, },
+ { 34, 18, 46, 30, 33, 17, 45, 29, },
+ { 10, 58, 6, 54, 9, 57, 5, 53, },
+ { 42, 26, 38, 22, 41, 25, 37, 21, },
+};
+
+//This func reads from 1 slice, 1 and clears 0 & 1
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE(pos) \
+ temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
+ src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
+ if (temp & 0x100) temp = ~(temp >> 31); \
+ dst[x + pos] = temp;
+
+ for (int y = 0; y < height; y++) {
+ const uint8_t *d = dither[y];
+ for (int x = 0; x < width; x += 8) {
+ int temp;
+ STORE(0);
+ STORE(1);
+ STORE(2);
+ STORE(3);
+ STORE(4);
+ STORE(5);
+ STORE(6);
+ STORE(7);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+//This func reads from 2 slices, 0 & 2 and clears 2-nd
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE2(pos) \
+ temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
+ src[x + pos + 16 * src_stride] = 0; \
+ if (temp & 0x100) temp = ~(temp >> 31); \
+ dst[x + pos] = temp;
+
+ for (int y = 0; y < height; y++) {
+ const uint8_t *d = dither[y];
+ for (int x = 0; x < width; x += 8) {
+ int temp;
+ STORE2(0);
+ STORE2(1);
+ STORE2(2);
+ STORE2(3);
+ STORE2(4);
+ STORE2(5);
+ STORE2(6);
+ STORE2(7);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+{
+ for (int a = 0; a < 64; a++)
+ thr_adr[a] = q * thr_adr_noq[a];
+}
+
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+ int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+ int16_t *dataptr;
+ int16_t *wsptr;
+ int16_t *threshold;
+
+ dataptr = data;
+ wsptr = output;
+
+ for (; cnt > 0; cnt -= 2) { //start positions
+ threshold = (int16_t *)thr_adr;//threshold_mtx
+ for (int ctr = DCTSIZE; ctr > 0; ctr--) {
+ // Process columns from input, add to output.
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+ // Even part of FDCT
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ d0 = tmp10 + tmp11;
+ d4 = tmp10 - tmp11;
+
+ z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ d2 = tmp13 + z1;
+ d6 = tmp13 - z1;
+
+ // Even part of IDCT
+
+ THRESHOLD(tmp0, d0, threshold[0 * 8]);
+ THRESHOLD(tmp1, d2, threshold[2 * 8]);
+ THRESHOLD(tmp2, d4, threshold[4 * 8]);
+ THRESHOLD(tmp3, d6, threshold[6 * 8]);
+ tmp0 += 2;
+ tmp10 = (tmp0 + tmp2) >> 2;
+ tmp11 = (tmp0 - tmp2) >> 2;
+
+ tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
+ tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
+
+ tmp0 = tmp10 + tmp13; //->temps
+ tmp3 = tmp10 - tmp13; //->temps
+ tmp1 = tmp11 + tmp12; //->temps
+ tmp2 = tmp11 - tmp12; //->temps
+
+ // Odd part of FDCT
+
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
+ z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
+ z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
+ z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ d5 = z13 + z2;
+ d3 = z13 - z2;
+ d1 = z11 + z4;
+ d7 = z11 - z4;
+
+ // Odd part of IDCT
+
+ THRESHOLD(tmp4, d1, threshold[1 * 8]);
+ THRESHOLD(tmp5, d3, threshold[3 * 8]);
+ THRESHOLD(tmp6, d5, threshold[5 * 8]);
+ THRESHOLD(tmp7, d7, threshold[7 * 8]);
+
+ //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+ z13 = tmp6 + tmp5;
+ z10 = (tmp6 - tmp5) << 1;
+ z11 = tmp4 + tmp7;
+ z12 = (tmp4 - tmp7) << 1;
+
+ tmp7 = (z11 + z13) >> 2; //+2 !
+ tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 + tmp5;
+
+ wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
+ wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
+ wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
+ wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
+ wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
+ wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
+ wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
+ wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
+ //
+ dataptr++; //next column
+ wsptr++;
+ threshold++;
+ }
+ dataptr += 8; //skip each second start pos
+ wsptr += 8;
+ }
+}
+
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z5, z10, z11, z12, z13;
+ int16_t *outptr;
+ int16_t *wsptr;
+
+ cnt *= 4;
+ wsptr = workspace;
+ outptr = output_adr;
+ for (; cnt > 0; cnt--) {
+ // Even part
+ //Simd version reads 4x4 block and transposes it
+ tmp10 = wsptr[2] + wsptr[3];
+ tmp11 = wsptr[2] - wsptr[3];
+
+ tmp13 = wsptr[0] + wsptr[1];
+ tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
+
+ tmp0 = tmp10 + tmp13; //->temps
+ tmp3 = tmp10 - tmp13; //->temps
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ // Odd part
+ //Also transpose, with previous:
+ // ---- ---- ||||
+ // ---- ---- idct ||||
+ // ---- ---- ---> ||||
+ // ---- ---- ||||
+ z13 = wsptr[4] + wsptr[5];
+ z10 = wsptr[4] - wsptr[5];
+ z11 = wsptr[6] + wsptr[7];
+ z12 = wsptr[6] - wsptr[7];
+
+ tmp7 = z11 + z13;
+ tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
+
+ tmp6 = (tmp12 << 3) - tmp7;
+ tmp5 = (tmp11 << 3) - tmp6;
+ tmp4 = (tmp10 << 3) + tmp5;
+
+ // Final output stage: descale and write column
+ outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
+ outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
+ outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
+ outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
+ outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
+ outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
+ outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
+ outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
+ outptr++;
+
+ wsptr += DCTSIZE; // advance pointer to next row
+ }
+}
+
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+ int16_t *dataptr;
+
+ cnt *= 4;
+ // Pass 1: process rows.
+
+ dataptr = data;
+ for (; cnt > 0; cnt--) {
+ tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
+ tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
+ tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
+ tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
+ tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
+ tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
+ tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
+ tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
+
+ // Even part
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+ //Even columns are written first, this leads to different order of columns
+ //in column_fidct(), but they are processed independently, so all ok.
+ //Later in the row_idct() columns are read in the same order.
+ dataptr[2] = tmp10 + tmp11;
+ dataptr[3] = tmp10 - tmp11;
+
+ z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ dataptr[0] = tmp13 + z1;
+ dataptr[1] = tmp13 - z1;
+
+ // Odd part
+
+ tmp10 = (tmp4 + tmp5) << 2;
+ tmp11 = (tmp5 + tmp6) << 2;
+ tmp12 = (tmp6 + tmp7) << 2;
+
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ dataptr[4] = z13 + z2;
+ dataptr[5] = z13 - z2;
+ dataptr[6] = z11 + z4;
+ dataptr[7] = z11 - z4;
+
+ pixels++; // advance pointer to next column
+ dataptr += DCTSIZE;
+ }
+}
diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fsppdsp.h
similarity index 52%
rename from libavfilter/vf_fspp.h
rename to libavfilter/vf_fsppdsp.h
index ee7de3ffef..c441b75094 100644
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -20,56 +20,17 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
-#ifndef AVFILTER_FSPP_H
-#define AVFILTER_FSPP_H
+#ifndef AVFILTER_FSPPDSP_H
+#define AVFILTER_FSPPDSP_H
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
+#include <stddef.h>
+#include <stdint.h>
-#define BLOCKSZ 12
-#define MAX_LEVEL 5
+#include "config.h"
-#define DCTSIZE 8
-#define DCTSIZE_S "8"
-
-#define FIX(x,s) ((x) * (1 << s) + 0.5)
-
-#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
-#define THRESHOLD(r,x,t) \
- if(((unsigned)((x) + t)) > t * 2) r = (x); \
- else r = 0;
-#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
-
-typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
-static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
-static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
-static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
-
-typedef struct FSPPContext {
- AVClass *class;
- uint64_t threshold_mtx_noq[8 * 2];
- uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions
-
- int log2_count;
- int strength;
- int hsub;
- int vsub;
- int temp_stride;
- int qp;
- enum AVVideoEncParamsType qscale_type;
- int prev_q;
- uint8_t *src;
- int16_t *temp;
- int8_t *non_b_qp_table;
- int non_b_qp_stride;
- int use_bframe_qp;
+#include "libavutil/attributes_internal.h"
+typedef struct FSPPDSPContext {
void (*store_slice)(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
@@ -88,9 +49,35 @@ typedef struct FSPPContext {
void (*row_fdct)(int16_t *data, const uint8_t *pixels,
ptrdiff_t line_size, int cnt);
-
-} FSPPContext;
-
-void ff_fspp_init_x86(FSPPContext *fspp);
-
-#endif /* AVFILTER_FSPP_H */
+} FSPPDSPContext;
+
+FF_VISIBILITY_PUSH_HIDDEN
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+
+void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
+FF_VISIBILITY_POP_HIDDEN
+
+static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
+{
+ fspp->store_slice = ff_store_slice_c;
+ fspp->store_slice2 = ff_store_slice2_c;
+ fspp->mul_thrmat = ff_mul_thrmat_c;
+ fspp->column_fidct = ff_column_fidct_c;
+ fspp->row_idct = ff_row_idct_c;
+ fspp->row_fdct = ff_row_fdct_c;
+
+#if ARCH_X86
+ ff_fsppdsp_init_x86(fspp);
+#endif
+}
+
+#endif /* AVFILTER_FSPPDSP_H */
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 8e00317cb7..2aadb50967 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -21,7 +21,7 @@
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_fspp.h"
+#include "libavfilter/vf_fsppdsp.h"
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
@@ -34,7 +34,7 @@ void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int c
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
-av_cold void ff_fspp_init_x86(FSPPContext *s)
+av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
-----------------------------------------------------------------------
Summary of changes:
libavfilter/Makefile | 2 +-
libavfilter/vf_fspp.c | 450 +++++++---------------------------------
libavfilter/vf_fspp.h | 96 ---------
libavfilter/vf_fsppdsp.c | 371 +++++++++++++++++++++++++++++++++
libavfilter/vf_fsppdsp.h | 89 ++++++++
libavfilter/x86/vf_fspp.asm | 452 ++++++++++++++++++++---------------------
libavfilter/x86/vf_fspp_init.c | 28 +--
libavfilter/x86/vf_spp.c | 2 +-
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_fspp.c | 170 ++++++++++++++++
tests/fate/checkasm.mak | 1 +
13 files changed, 937 insertions(+), 729 deletions(-)
delete mode 100644 libavfilter/vf_fspp.h
create mode 100644 libavfilter/vf_fsppdsp.c
create mode 100644 libavfilter/vf_fsppdsp.h
create mode 100644 tests/checkasm/vf_fspp.c
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]