[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsad_intra16
ffmpeg | branch: master | Hubert Mazur | Thu Sep 8 11:25:05 2022 +0200| [ce03ea3e796bdf9013da51d53e89759a92707c4a] | committer: Martin Storsjö lavc/aarch64: Add neon implementation for vsad_intra16 Provide optimized implementation for vsad_intra16 function for arm64. Performance comparison tests are shown below. - vsad_4_c: 177.5 - vsad_4_neon: 23.5 Benchmarks and tests are run with checkasm tool on AWS Graviton 3. Signed-off-by: Hubert Mazur Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ce03ea3e796bdf9013da51d53e89759a92707c4a --- libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++ libavcodec/aarch64/me_cmp_neon.S | 48 2 files changed, 51 insertions(+) diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c index 7b81e48d16..af83f7ed1e 100644 --- a/libavcodec/aarch64/me_cmp_init_aarch64.c +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c @@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride, int h); +int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy, + ptrdiff_t stride, int h) ; int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride, int h); @@ -64,6 +66,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) c->sse[2] = sse4_neon; c->vsad[0] = vsad16_neon; +c->vsad[4] = vsad_intra16_neon; c->vsse[0] = vsse16_neon; } diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index b3f376aa60..ce198ea227 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -736,3 +736,51 @@ function vsse16_neon, export=1 ret endfunc + +function vsad_intra16_neon, export=1 +// x0 unused +// x1 uint8_t *pix1 +// x2 uint8_t *dummy +// x3 ptrdiff_t stride +// w4 int h + +ld1 {v0.16b}, [x1], x3 +sub w4, w4, #1 // we need to make h-1 
iterations +cmp w4, #3 +moviv16.8h, #0 +b.lt2f + +// make 4 iterations at once +1: +// v = abs( pix1[0] - pix1[0 + stride] ) +// score = sum(v) +ld1 {v1.16b}, [x1], x3 +ld1 {v2.16b}, [x1], x3 +uabal v16.8h, v0.8b, v1.8b +ld1 {v3.16b}, [x1], x3 +uabal2 v16.8h, v0.16b, v1.16b +sub w4, w4, #3 +uabal v16.8h, v1.8b, v2.8b +cmp w4, #3 +uabal2 v16.8h, v1.16b, v2.16b +mov v0.16b, v3.16b +uabal v16.8h, v2.8b, v3.8b +uabal2 v16.8h, v2.16b, v3.16b +b.ge1b +cbz w4, 3f + +// iterate by one +2: +ld1 {v1.16b}, [x1], x3 +subsw4, w4, #1 +uabal v16.8h, v0.8b, v1.8b +uabal2 v16.8h, v0.16b, v1.16b +mov v0.16b, v1.16b +cbnzw4, 2b + +3: +uaddlv s17, v16.8h +fmovw0, s17 + +ret +endfunc ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsad16
ffmpeg | branch: master | Hubert Mazur | Thu Sep 8 11:25:03 2022 +0200| [200f5e578f2fbf70a966f08257e0500a6f1ddd6c] | committer: Martin Storsjö lavc/aarch64: Add neon implementation for vsad16 Provide optimized implementation of vsad16 function for arm64. Performance comparison tests are shown below. - vsad_0_c: 285.2 - vsad_0_neon: 39.5 Benchmarks and tests are run with checkasm tool on AWS Graviton 3. Co-authored-by: Martin Storsjö Signed-off-by: Hubert Mazur Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=200f5e578f2fbf70a966f08257e0500a6f1ddd6c --- libavcodec/aarch64/me_cmp_init_aarch64.c | 5 +++ libavcodec/aarch64/me_cmp_neon.S | 65 2 files changed, 70 insertions(+) diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c index fb7c3f5059..ddc5d05611 100644 --- a/libavcodec/aarch64/me_cmp_init_aarch64.c +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c @@ -41,6 +41,9 @@ int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h); +int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, +ptrdiff_t stride, int h); + av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) { int cpu_flags = av_get_cpu_flags(); @@ -57,5 +60,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) c->sse[0] = sse16_neon; c->sse[1] = sse8_neon; c->sse[2] = sse4_neon; + +c->vsad[0] = vsad16_neon; } } diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index 4198985c6c..1d0b166d69 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -584,3 +584,68 @@ function sse4_neon, export=1 ret endfunc + +function vsad16_neon, export=1 +// x0 unused +// x1 uint8_t *pix1 +// x2 uint8_t *pix2 +// x3 ptrdiff_t stride +// w4 int h + +ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration 
+ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration + +sub w4, w4, #1 // we need to make h-1 iterations +moviv16.8h, #0 + +cmp w4, #3 // check if we can make 3 iterations at once +usubl v31.8h, v0.8b, v1.8b// Signed difference pix1[0] - pix2[0], first iteration +usubl2 v30.8h, v0.16b, v1.16b // Signed difference pix1[0] - pix2[0], first iteration + +b.lt2f + +1: +// abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride]) +ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration +ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration +ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration +ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration +usubl v29.8h, v0.8b, v1.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration +usubl2 v28.8h, v0.16b, v1.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration +ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration +ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration +usubl v27.8h, v2.8b, v3.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration +sabav16.8h, v31.8h, v29.8h // Signed absolute difference and accumulate the result. first iteration +usubl2 v26.8h, v2.16b, v3.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration +sabav16.8h, v30.8h, v28.8h // Signed absolute difference and accumulate the result. first iteration +usubl v25.8h, v4.8b, v5.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration +usubl2 v24.8h, v4.16b, v5.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration +sabav16.8h, v29.8h, v27.8h // Signed absolute difference and accumulate the result. second iteration +mov v31.16b, v25.16b +sabav16.8h, v28.8h, v26.8h // Signed absolute difference and accumulate the result. second iteration +sub w4, w4, #3 // h -= 3 +mov
[FFmpeg-cvslog] lavc/aarch64: Add neon implementation of vsse16
ffmpeg | branch: master | Hubert Mazur | Thu Sep 8 11:25:04 2022 +0200| [c495a4b32d352e087318d3a09a9bb4f2b55cfa04] | committer: Martin Storsjö lavc/aarch64: Add neon implementation of vsse16 Provide optimized implementation of vsse16 for arm64. Performance comparison tests are shown below. - vsse_0_c: 257.7 - vsse_0_neon: 59.2 Benchmarks and tests are run with checkasm tool on AWS Graviton 3. Signed-off-by: Hubert Mazur Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c495a4b32d352e087318d3a09a9bb4f2b55cfa04 --- libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++ libavcodec/aarch64/me_cmp_neon.S | 87 2 files changed, 91 insertions(+) diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c index ddc5d05611..7b81e48d16 100644 --- a/libavcodec/aarch64/me_cmp_init_aarch64.c +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c @@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride, int h); +int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, +ptrdiff_t stride, int h); av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) { @@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) c->sse[2] = sse4_neon; c->vsad[0] = vsad16_neon; + +c->vsse[0] = vsse16_neon; } } diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index 1d0b166d69..b3f376aa60 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -649,3 +649,90 @@ function vsad16_neon, export=1 ret endfunc + +function vsse16_neon, export=1 +// x0 unused +// x1 uint8_t *pix1 +// x2 uint8_t *pix2 +// x3 ptrdiff_t stride +// w4 int h + +ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration +ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration + +sub w4, w4, #1 // we need to make h-1 
iterations +moviv16.4s, #0 +moviv17.4s, #0 + +cmp w4, #3 // check if we can make 3 iterations at once +usubl v31.8h, v0.8b, v1.8b// Signed difference of pix1[0] - pix2[0], first iteration +usubl2 v30.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration +b.le2f + + +1: +// x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride]) +// res = (x) * (x) +ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration +ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration +ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration +ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration +usubl v29.8h, v0.8b, v1.8b +usubl2 v28.8h, v0.16b, v1.16b +ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration +ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration +sabdv31.8h, v31.8h, v29.8h +sabdv30.8h, v30.8h, v28.8h +usubl v27.8h, v2.8b, v3.8b +usubl2 v26.8h, v2.16b, v3.16b +usubl v25.8h, v4.8b, v5.8b +usubl2 v24.8h, v4.16b, v5.16b +sabdv29.8h, v29.8h, v27.8h +sabdv27.8h, v27.8h, v25.8h +umlal v16.4s, v31.4h, v31.4h +umlal2 v17.4s, v31.8h, v31.8h +sabdv28.8h, v28.8h, v26.8h +sabdv26.8h, v26.8h, v24.8h +umlal v16.4s, v30.4h, v30.4h +umlal2 v17.4s, v30.8h, v30.8h +mov v31.16b, v25.16b +umlal v16.4s, v29.4h, v29.4h +umlal2 v17.4s, v29.8h, v29.8h +mov v30.16b, v24.16b +umlal v16.4s, v28.4h, v28.4h +umlal2 v17.4s, v28.8h, v28.8h +sub w4, w4, #3 +umlal v16.4s, v27.4h, v27.4h +umlal2 v17.4s, v27.8h, v27.8h +cmp w4, #3 +umlal v16.4s, v26.4h, v26.4h +umlal2 v17.4s, v26.8h, v26.8h + +b.ge1b + +cbz w4, 3f + +// iterate by one +2: +ld1 {v0.16b}, [x1], x3 +ld1 {v1.16b}, [x2], x3 +subs
[FFmpeg-cvslog] lavc/aarch64: Provide neon implementation of nsse16
ffmpeg | branch: master | Hubert Mazur | Thu Sep 8 11:25:07 2022 +0200| [06b98e396adc467a5164a03d71dd71508a2d8881] | committer: Martin Storsjö lavc/aarch64: Provide neon implementation of nsse16 Add vectorized implementation of nsse16 function. Performance comparison tests are shown below. - nsse_0_c: 682.2 - nsse_0_neon: 116.5 Benchmarks and tests run with checkasm tool on AWS Graviton 3. Co-authored-by: Martin Storsjö Signed-off-by: Hubert Mazur Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=06b98e396adc467a5164a03d71dd71508a2d8881 --- libavcodec/aarch64/me_cmp_init_aarch64.c | 15 libavcodec/aarch64/me_cmp_neon.S | 122 +++ 2 files changed, 137 insertions(+) diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c index 8c295d5457..ade3e9a4c1 100644 --- a/libavcodec/aarch64/me_cmp_init_aarch64.c +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c @@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride, int h); int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy, ptrdiff_t stride, int h); +int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2, +ptrdiff_t stride, int h); +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, +ptrdiff_t stride, int h); av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) { @@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) c->vsse[0] = vsse16_neon; c->vsse[4] = vsse_intra16_neon; + +c->nsse[0] = nsse16_neon_wrapper; } } + +int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, +ptrdiff_t stride, int h) +{ +if (c) +return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h); +else +return nsse16_neon(8, s1, s2, stride, h); +} diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index cf2b8da425..f8998749a5 100644 --- 
a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -847,3 +847,125 @@ function vsse_intra16_neon, export=1 ret endfunc + +function nsse16_neon, export=1 +// x0 multiplier +// x1 uint8_t *pix1 +// x2 uint8_t *pix2 +// x3 ptrdiff_t stride +// w4 int h + +str x0, [sp, #-0x40]! +stp x1, x2, [sp, #0x10] +stp x3, x4, [sp, #0x20] +str x30, [sp, #0x30] +bl X(sse16_neon) +ldr x30, [sp, #0x30] +mov w9, w0 // here we store score1 +ldr x5, [sp] +ldp x1, x2, [sp, #0x10] +ldp x3, x4, [sp, #0x20] +add sp, sp, #0x40 + +moviv16.8h, #0 +moviv17.8h, #0 +moviv18.8h, #0 +moviv19.8h, #0 + +ld1 {v0.16b}, [x1], x3 +subsw4, w4, #1 // we need to make h-1 iterations +ld1 {v2.16b}, [x2], x3 +ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1 +cmp w4, #2 +ext v3.16b, v2.16b, v2.16b, #1 // x2 + 1 + +b.lt2f + +// make 2 iterations at once +1: +ld1 {v4.16b}, [x1], x3 +ld1 {v6.16b}, [x2], x3 +ld1 {v20.16b}, [x1], x3 +ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1 +usubl v31.8h, v0.8b, v4.8b +usubl2 v30.8h, v0.16b, v4.16b +ld1 {v22.16b}, [x2], x3 +usubl v29.8h, v1.8b, v5.8b +usubl2 v28.8h, v1.16b, v5.16b +ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1 +sabav16.8h, v31.8h, v29.8h +ext v21.16b, v20.16b, v20.16b, #1 +sabav17.8h, v30.8h, v28.8h +usubl v27.8h, v2.8b, v6.8b +usubl2 v26.8h, v2.16b, v6.16b +ext v23.16b, v22.16b, v22.16b, #1 +usubl v25.8h, v3.8b, v7.8b +usubl2 v24.8h, v3.16b, v7.16b +sabav18.8h, v27.8h, v25.8h +sabav19.8h, v26.8h, v24.8h + +usubl v31.8h, v4.8b, v20.8b +usubl2 v30.8h, v4.16b, v20.16b +usubl v29.8h, v5.8b, v21.8b +usubl2 v28.8h, v5.16b, v21.16b +sabav16.8h, v31.8h, v29.8h +sabav1
[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsse_intra16
ffmpeg | branch: master | Hubert Mazur | Thu Sep 8 11:25:06 2022 +0200| [908abe8032d2e56f9b94a7ae387e415de4c29115] | committer: Martin Storsjö lavc/aarch64: Add neon implementation for vsse_intra16 Provide optimized implementation for vsse_intra16 for arm64. Performance tests are shown below. - vsse_4_c: 155.2 - vsse_4_neon: 36.2 Benchmarks and tests are run with checkasm tool on AWS Graviton 3. Signed-off-by: Hubert Mazur Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=908abe8032d2e56f9b94a7ae387e415de4c29115 --- libavcodec/aarch64/me_cmp_init_aarch64.c | 3 ++ libavcodec/aarch64/me_cmp_neon.S | 63 2 files changed, 66 insertions(+) diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c index af83f7ed1e..8c295d5457 100644 --- a/libavcodec/aarch64/me_cmp_init_aarch64.c +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c @@ -47,6 +47,8 @@ int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy, ptrdiff_t stride, int h) ; int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2, ptrdiff_t stride, int h); +int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy, + ptrdiff_t stride, int h); av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) { @@ -69,5 +71,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) c->vsad[4] = vsad_intra16_neon; c->vsse[0] = vsse16_neon; +c->vsse[4] = vsse_intra16_neon; } } diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index ce198ea227..cf2b8da425 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -784,3 +784,66 @@ function vsad_intra16_neon, export=1 ret endfunc + +function vsse_intra16_neon, export=1 +// x0 unused +// x1 uint8_t *pix1 +// x2 uint8_t *dummy +// x3 ptrdiff_t stride +// w4 int h + +ld1 {v0.16b}, [x1], x3 +moviv16.4s, #0 +moviv17.4s, #0 + +sub w4, w4, #1 // we need to 
make h-1 iterations +cmp w4, #3 +b.lt2f + +1: +// v = abs( pix1[0] - pix1[0 + stride] ) +// score = sum( v * v ) +ld1 {v1.16b}, [x1], x3 +ld1 {v2.16b}, [x1], x3 +uabdv30.16b, v0.16b, v1.16b +ld1 {v3.16b}, [x1], x3 +umull v29.8h, v30.8b, v30.8b +umull2 v28.8h, v30.16b, v30.16b +uabdv27.16b, v1.16b, v2.16b +uadalp v16.4s, v29.8h +umull v26.8h, v27.8b, v27.8b +umull2 v27.8h, v27.16b, v27.16b +uadalp v17.4s, v28.8h +uabdv25.16b, v2.16b, v3.16b +uadalp v16.4s, v26.8h +umull v24.8h, v25.8b, v25.8b +umull2 v25.8h, v25.16b, v25.16b +uadalp v17.4s, v27.8h +sub w4, w4, #3 +uadalp v16.4s, v24.8h +cmp w4, #3 +uadalp v17.4s, v25.8h +mov v0.16b, v3.16b + +b.ge1b +cbz w4, 3f + +// iterate by one +2: +ld1 {v1.16b}, [x1], x3 +subsw4, w4, #1 +uabdv30.16b, v0.16b, v1.16b +mov v0.16b, v1.16b +umull v29.8h, v30.8b, v30.8b +umull2 v30.8h, v30.16b, v30.16b +uadalp v16.4s, v29.8h +uadalp v17.4s, v30.8h +cbnzw4, 2b + +3: +add v16.4s, v16.4s, v17.4S +uaddlv d17, v16.4s +fmovw0, s17 + +ret +endfunc ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] tests/fate-run: Allow to set input options for encoding pass
ffmpeg | branch: master | Andreas Rheinhardt | Wed Sep 7 22:36:45 2022 +0200| [a5ab4be081aee22e675d0e78aa9ca0d08f4a5d6f] | committer: Andreas Rheinhardt tests/fate-run: Allow to set input options for encoding pass This will be useful in the next commit. Signed-off-by: Andreas Rheinhardt > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a5ab4be081aee22e675d0e78aa9ca0d08f4a5d6f --- tests/fate-run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fate-run.sh b/tests/fate-run.sh index 4008bcbc16..61cc59acc0 100755 --- a/tests/fate-run.sh +++ b/tests/fate-run.sh @@ -247,12 +247,13 @@ transcode(){ ffprobe_opts=$6 additional_input=$7 final_decode=$8 +enc_opt_in=$9 test -z "$additional_input" || additional_input="$DEC_OPTS $additional_input" encfile="${outdir}/${test}.${enc_fmt}" test $keep -ge 1 || cleanfiles="$cleanfiles $encfile" tsrcfile=$(target_path $srcfile) tencfile=$(target_path $encfile) -ffmpeg -f $src_fmt $DEC_OPTS -i $tsrcfile $additional_input \ +ffmpeg -f $src_fmt $DEC_OPTS $enc_opt_in -i $tsrcfile $additional_input \ $ENC_OPTS $enc_opt $FLAGS -f $enc_fmt -y $tencfile || return do_md5sum $encfile echo $(wc -c $encfile) ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] fate/matroska: Add test for updating AV1 extradata
ffmpeg | branch: master | Andreas Rheinhardt | Wed Sep 7 23:57:15 2022 +0200| [91e9a6df33d8b14577fe1ec9623d9d0466fdd7d3] | committer: Andreas Rheinhardt fate/matroska: Add test for updating AV1 extradata Signed-off-by: Andreas Rheinhardt > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=91e9a6df33d8b14577fe1ec9623d9d0466fdd7d3 --- tests/fate/matroska.mak | 6 ++ tests/ref/fate/webm-av1-extradata-update | 32 2 files changed, 38 insertions(+) diff --git a/tests/fate/matroska.mak b/tests/fate/matroska.mak index 63e81f121b..39137ad4be 100644 --- a/tests/fate/matroska.mak +++ b/tests/fate/matroska.mak @@ -49,6 +49,12 @@ FATE_MATROSKA-$(call ALLYES, FLAC_DECODER FLAC_ENCODER FLAC_PARSER \ fate-matroska-flac-extradata-update: CMD = transcode matroska $(TARGET_SAMPLES)/mkv/flac_channel_layouts.mka \ matroska "-map 0 -map 0:0 -c flac -frames:a:2 8" "-map 0 -c copy" +# This tests that the Matroska/WebM muxer writes the AV1 CodecPrivate +# via extradata obtained from packet side data. It also tests that +# the aspect ratio is only written with pixels as DisplayUnit for WebM. +FATE_MATROSKA-$(call REMUX, WEBM MATROSKA, IVF_DEMUXER AV1_PARSER EXTRACT_EXTRADATA_BSF) += fate-webm-av1-extradata-update +fate-webm-av1-extradata-update: CMD = transcode ivf $(TARGET_SAMPLES)/av1/decode_model.ivf webm "-c copy -bsf extract_extradata -sar 3:1" "-c copy" "" "" "-nofind_stream_info" "-nofind_stream_info" + # This test tests demuxing Vorbis and chapters from ogg and muxing it in and # demuxing it from Matroska/WebM. It furthermore tests the WebM muxer, in # particular its DASH mode. Finally, it tests writing the Cues at the front. 
diff --git a/tests/ref/fate/webm-av1-extradata-update b/tests/ref/fate/webm-av1-extradata-update new file mode 100644 index 00..9dd2056e0e --- /dev/null +++ b/tests/ref/fate/webm-av1-extradata-update @@ -0,0 +1,32 @@ +fbf3091fdf05b2856c578e7c948d68c3 *tests/data/fate/webm-av1-extradata-update.webm +23048 tests/data/fate/webm-av1-extradata-update.webm +#extradata 0: 35, 0x527207cd +#tb 0: 1/1000 +#media_type 0: video +#codec_id 0: av1 +#dimensions 0: 240x100 +#sar 0: 3/1 +0, 0, 0,0, 8168, 0x1851ab62 +0, 42, 42,0, 7040, 0x967788f9, F=0x0 +0, 83, 83,0,4, 0x01f400e2, F=0x0 +0,125,125,0, 48, 0x49ad107e, F=0x0 +0,167,167,0,4, 0x021c00fa, F=0x0 +0,208,208,0, 279, 0x69728439, F=0x0 +0,250,250,0,4, 0x01c400d2, F=0x0 +0,292,292,0, 63, 0x9bbf1836, F=0x0 +0,333,333,0,4, 0x026c012a, F=0x0 +0,375,375,0, 1065, 0xce2003ac, F=0x0 +0,417,417,0,4, 0x019400c2, F=0x0 +0,458,458,0, 52, 0x7a0112f1, F=0x0 +0,500,500,0,4, 0x021c010a, F=0x0 +0,542,542,0, 689, 0x1e8b49e7, F=0x0 +0,583,583,0,4, 0x01e400f2, F=0x0 +0,625,625,0, 209, 0x124c6790, F=0x0 +0,667,667,0, 42, 0xea690e31, F=0x0 +0,708,708,0, 3521, 0xd76ee284, F=0x0 +0,750,750,0, 63, 0x4572188f, F=0x0 +0,792,792,0, 386, 0xb078c259, F=0x0 +0,833,833,0, 178, 0x1ebb5121, F=0x0 +0,875,875,0, 60, 0x729317f7, F=0x0 +0,917,917,0, 40, 0xad970a66, F=0x0 +0,958,958,0, 61, 0xcc0d1a20, F=0x0 ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavu/tx: add the inplace flag to PFA FFTs
ffmpeg | branch: master | Lynne | Sat Sep 10 02:26:49 2022 +0200| [645a1f4422ad9c8c954e7c42bef2281cac96ab18] | committer: Lynne lavu/tx: add the inplace flag to PFA FFTs They support in-place, because they have to use a temporary buffer. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=645a1f4422ad9c8c954e7c42bef2281cac96ab18 --- libavutil/tx_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 35b61fa477..542c15e480 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -836,7 +836,7 @@ static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_##N##xM_def) = {\ .name = TX_NAME_STR("fft_pfa_" #N "xM"), \ .function = TX_NAME(ff_tx_fft_pfa_##N##xM), \ .type = TX_TYPE(FFT), \ -.flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, \ +.flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, \ .factors= { N, TX_FACTOR_ANY }, \ .min_len= N*2, \ .max_len= TX_LEN_UNLIMITED, \ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavu/tx: generalize MDCTs
ffmpeg | branch: master | Lynne | Sat Sep 10 02:28:10 2022 +0200| [51172223fd1a5b71b46fc0d398f4fdc9ed081b83] | committer: Lynne lavu/tx: generalize MDCTs The same code can perform any-length MDCTs with minimal changes. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=51172223fd1a5b71b46fc0d398f4fdc9ed081b83 --- libavutil/tx_template.c | 75 ++--- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 542c15e480..1d4c4d294b 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -940,12 +940,12 @@ static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = { .prio = FF_TX_PRIO_MIN, }; -static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s, - const FFTXCodelet *cd, - uint64_t flags, - FFTXCodeletOptions *opts, - int len, int inv, - const void *scale) +static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s, +const FFTXCodelet *cd, +uint64_t flags, +FFTXCodeletOptions *opts, +int len, int inv, +const void *scale) { int ret; FFTXCodeletOptions sub_opts = { .invert_lookup = inv }; @@ -955,32 +955,49 @@ static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s, flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ flags |= AV_TX_INPLACE; /* in-place */ -flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ +flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */ if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, -inv, scale))) -return ret; +inv, scale))) { +flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */ +if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, +inv, scale))) +return ret; +} + +/* If we need to preshuffle just steal the map from the subcontext */ +if (s->sub[0].flags & FF_TX_PRESHUFFLE) { +s->map = s->sub[0].map; +s->sub[0].map = NULL; +} else { +s->map = av_malloc((len >> 1)*sizeof(*s->map)); +if (!s->map) +return AVERROR(ENOMEM); + +for (int i = 0; i < len 
>> 1; i++) +s->map[i] = i; +} -if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL))) +if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) return ret; /* Saves a multiply in a hot path. */ if (inv) for (int i = 0; i < (s->len >> 1); i++) -s->sub->map[i] <<= 1; +s->map[i] <<= 1; return 0; } -static void TX_NAME(ff_tx_mdct_sr_fwd)(AVTXContext *s, void *_dst, void *_src, - ptrdiff_t stride) +static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src, +ptrdiff_t stride) { TXSample *src = _src, *dst = _dst; TXComplex *exp = s->exp, tmp, *z = _dst; const int len2 = s->len >> 1; const int len4 = s->len >> 2; const int len3 = len2 * 3; -const int *sub_map = s->sub->map; +const int *sub_map = s->map; stride /= sizeof(*dst); @@ -1011,14 +1028,14 @@ static void TX_NAME(ff_tx_mdct_sr_fwd)(AVTXContext *s, void *_dst, void *_src, } } -static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src, - ptrdiff_t stride) +static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src, +ptrdiff_t stride) { TXComplex *z = _dst, *exp = s->exp; const TXSample *src = _src, *in1, *in2; const int len2 = s->len >> 1; const int len4 = s->len >> 2; -const int *sub_map = s->sub->map; +const int *sub_map = s->map; stride /= sizeof(*src); in1 = src; @@ -1043,28 +1060,28 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src, } } -static const FFTXCodelet TX_NAME(ff_tx_mdct_sr_fwd_def) = { -.name = TX_NAME_STR("mdct_sr_fwd"), -.function = TX_NAME(ff_tx_mdct_sr_fwd), +static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = { +.name = TX_NAME_STR("mdct_fwd"), +.function = TX_NAME(ff_tx_mdct_fwd), .type = TX_TYPE(MDCT), .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, -.factors[0] = 2, +.factors= { 2, TX_FACTOR_A
[FFmpeg-cvslog] lavu/tx: propagate the codelet flags into the context
ffmpeg | branch: master | Lynne | Sat Sep 10 02:26:02 2022 +0200| [8c283e8fe631135a0c36d50f9c8d558f43cfef7b] | committer: Lynne lavu/tx: propagate the codelet flags into the context The field is documented as a combination of both. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8c283e8fe631135a0c36d50f9c8d558f43cfef7b --- libavutil/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/tx.c b/libavutil/tx.c index da8ebddd9a..aeb0d9dada 100644 --- a/libavutil/tx.c +++ b/libavutil/tx.c @@ -620,7 +620,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, sctx->len= len; sctx->inv= inv; sctx->type = type; -sctx->flags = flags; +sctx->flags = cd->flags | flags; sctx->cd_self= cd; s->fn[s->nb_sub] = cd->function; ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavu/tx: rotate 3 & 15-point exptabs
ffmpeg | branch: master | Lynne | Sat Sep 10 02:31:43 2022 +0200| [c92edd969aaf8b12434ff4bd731aa4bc5548fbbf] | committer: Lynne lavu/tx: rotate 3 & 15-point exptabs This just inverts their signs. Simplifies SIMD. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c92edd969aaf8b12434ff4bd731aa4bc5548fbbf --- libavutil/tx_template.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index 1d4c4d294b..0c7ddd26f6 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -109,11 +109,11 @@ static av_cold void TX_TAB(ff_tx_init_tab_53)(void) TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12)); TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12)); TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 6)); -TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 6)); +TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI / 6)); TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI / 5)); -TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5)); +TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI / 5)); TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10)); -TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10)); +TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI / 5)); } static av_cold void TX_TAB(ff_tx_init_tab_7)(void) @@ -226,8 +226,8 @@ static av_always_inline void NAME(TXComplex *out, TXComplex *in,\ \ SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re); \ SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im); \ -CMUL(t[5].re, t[1].re, tab[5], tab[7], t[3].re, t[1].re); \ -CMUL(t[5].im, t[1].im, tab[5], tab[7], t[3].im, t[1].im); \ +CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \ +CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im); \ \ BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email 
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".