[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsad_intra16

2022-09-09 Thread Hubert Mazur
ffmpeg | branch: master | Hubert Mazur  | Thu Sep  8 
11:25:05 2022 +0200| [ce03ea3e796bdf9013da51d53e89759a92707c4a] | committer: 
Martin Storsjö

lavc/aarch64: Add neon implementation for vsad_intra16

Provide optimized implementation for vsad_intra16 function for arm64.

Performance comparison tests are shown below.
- vsad_4_c: 177.5
- vsad_4_neon: 23.5

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ce03ea3e796bdf9013da51d53e89759a92707c4a
---

 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S | 48 
 2 files changed, 51 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 7b81e48d16..af83f7ed1e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const 
uint8_t *pix2,
 
 int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
 ptrdiff_t stride, int h);
+int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t 
*dummy,
+  ptrdiff_t stride, int h) ;
 int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
 ptrdiff_t stride, int h);
 
@@ -64,6 +66,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
 c->sse[2] = sse4_neon;
 
 c->vsad[0] = vsad16_neon;
+c->vsad[4] = vsad_intra16_neon;
 
 c->vsse[0] = vsse16_neon;
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index b3f376aa60..ce198ea227 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -736,3 +736,51 @@ function vsse16_neon, export=1
 
 ret
 endfunc
+
+function vsad_intra16_neon, export=1 // score = sum over row pairs of sum(|row[y] - row[y+1]|) across 16 bytes; returned in w0
+// x0   unused (never referenced below)
+// x1   uint8_t *pix1 (post-incremented by stride on every row load)
+// x2   uint8_t *dummy (never referenced below)
+// x3   ptrdiff_t stride
+// w4   int h (reused as the number of row pairs still to process)
+
+ld1 {v0.16b}, [x1], x3 // v0 = first row, the "previous row" of the first difference
+sub w4, w4, #1 // we need to make h-1 iterations
+cmp w4, #3 // sets the flags tested by the b.lt below
+moviv16.8h, #0 // v16 = eight 16-bit partial sums, cleared
+b.lt2f // fewer than 3 row pairs: use the one-at-a-time loop. NOTE(review): h == 1 arrives here with w4 == 0 and loop 2 would count down past zero - presumably callers never pass h < 2; confirm
+
+// make 3 iterations at once (3 new rows are loaded, giving 3 row-pair differences)
+1:
+// v = abs( pix1[0] - pix1[0 + stride] )
+// score = sum(v)
+ld1 {v1.16b}, [x1], x3 // row n+1
+ld1 {v2.16b}, [x1], x3 // row n+2
+uabal   v16.8h, v0.8b, v1.8b // v16 += |v0 - v1|, bytes 0-7 widened to 16 bit
+ld1 {v3.16b}, [x1], x3 // row n+3
+uabal2  v16.8h, v0.16b, v1.16b // v16 += |v0 - v1|, bytes 8-15
+sub w4, w4, #3 // three row pairs consumed in this pass
+uabal   v16.8h, v1.8b, v2.8b
+cmp w4, #3 // flags for the b.ge at the bottom of the loop
+uabal2  v16.8h, v1.16b, v2.16b
+mov v0.16b, v3.16b // last loaded row becomes the previous row of the next pass
+uabal   v16.8h, v2.8b, v3.8b
+uabal2  v16.8h, v2.16b, v3.16b
+b.ge1b // at least 3 row pairs left: run another unrolled pass
+cbz w4, 3f // nothing left over: skip the tail loop
+
+// iterate by one
+2:
+ld1 {v1.16b}, [x1], x3 // next row
+subsw4, w4, #1 // one row pair consumed
+uabal   v16.8h, v0.8b, v1.8b
+uabal2  v16.8h, v0.16b, v1.16b
+mov v0.16b, v1.16b // current row becomes the previous row
+cbnzw4, 2b // repeat until the remaining count reaches zero
+
+3:
+uaddlv  s17, v16.8h // add the eight 16-bit lanes into one 32-bit scalar
+fmovw0, s17 // move the total to the integer return register
+
+ret
+endfunc

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsad16

2022-09-09 Thread Hubert Mazur
ffmpeg | branch: master | Hubert Mazur  | Thu Sep  8 
11:25:03 2022 +0200| [200f5e578f2fbf70a966f08257e0500a6f1ddd6c] | committer: 
Martin Storsjö

lavc/aarch64: Add neon implementation for vsad16

Provide optimized implementation of vsad16 function for arm64.

Performance comparison tests are shown below.
- vsad_0_c: 285.2
- vsad_0_neon: 39.5

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Co-authored-by: Martin Storsjö 
Signed-off-by: Hubert Mazur 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=200f5e578f2fbf70a966f08257e0500a6f1ddd6c
---

 libavcodec/aarch64/me_cmp_init_aarch64.c |  5 +++
 libavcodec/aarch64/me_cmp_neon.S | 65 
 2 files changed, 70 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index fb7c3f5059..ddc5d05611 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -41,6 +41,9 @@ int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const 
uint8_t *pix2,
 int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
   ptrdiff_t stride, int h);
 
+int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ptrdiff_t stride, int h);
+
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
 int cpu_flags = av_get_cpu_flags();
@@ -57,5 +60,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
 c->sse[0] = sse16_neon;
 c->sse[1] = sse8_neon;
 c->sse[2] = sse4_neon;
+
+c->vsad[0] = vsad16_neon;
 }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4198985c6c..1d0b166d69 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -584,3 +584,68 @@ function sse4_neon, export=1
 
 ret
 endfunc
+
+function vsad16_neon, export=1
+// x0   unused
+// x1   uint8_t *pix1
+// x2   uint8_t *pix2
+// x3   ptrdiff_t stride
+// w4   int h
+
+ld1 {v0.16b}, [x1], x3  // Load pix1[0], first 
iteration
+ld1 {v1.16b}, [x2], x3  // Load pix2[0], first 
iteration
+
+sub w4, w4, #1  // we need to make h-1 
iterations
+moviv16.8h, #0
+
+cmp w4, #3  // check if we can 
make 3 iterations at once
+usubl   v31.8h, v0.8b, v1.8b// Signed difference 
pix1[0] - pix2[0], first iteration
+usubl2  v30.8h, v0.16b, v1.16b  // Signed difference 
pix1[0] - pix2[0], first iteration
+
+b.lt2f
+
+1:
+// abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+ld1 {v0.16b}, [x1], x3  // Load pix1[0 + 
stride], first iteration
+ld1 {v1.16b}, [x2], x3  // Load pix2[0 + 
stride], first iteration
+ld1 {v2.16b}, [x1], x3  // Load pix1[0 + 
stride], second iteration
+ld1 {v3.16b}, [x2], x3  // Load pix2[0 + 
stride], second iteration
+usubl   v29.8h, v0.8b,  v1.8b   // Signed difference 
pix1[0 + stride] - pix2[0 + stride], first iteration
+usubl2  v28.8h, v0.16b, v1.16b  // Signed difference 
pix1[0 + stride] - pix2[0 + stride], first iteration
+ld1 {v4.16b}, [x1], x3  // Load pix1[0 + 
stride], third iteration
+ld1 {v5.16b}, [x2], x3  // Load pix2[0 + 
stride], third iteration
+usubl   v27.8h, v2.8b,  v3.8b   // Signed difference 
pix1[0 + stride] - pix2[0 + stride], second iteration
+sabav16.8h, v31.8h, v29.8h  // Signed absolute 
difference and accumulate the result. first iteration
+usubl2  v26.8h, v2.16b, v3.16b  // Signed difference 
pix1[0 + stride] - pix2[0 + stride], second iteration
+sabav16.8h, v30.8h, v28.8h  // Signed absolute 
difference and accumulate the result. first iteration
+usubl   v25.8h, v4.8b,  v5.8b   // Signed difference 
pix1[0 + stride] - pix2[0 + stride], third iteration
+usubl2  v24.8h, v4.16b, v5.16b  // Signed difference 
pix1[0 + stride] - pix2[0 + stride], third iteration
+sabav16.8h, v29.8h, v27.8h  // Signed absolute 
difference and accumulate the result. second iteration
+mov v31.16b, v25.16b
+sabav16.8h, v28.8h, v26.8h  // Signed absolute 
difference and accumulate the result. second iteration
+sub w4, w4, #3  // h -= 3
+mov 

[FFmpeg-cvslog] lavc/aarch64: Add neon implementation of vsse16

2022-09-09 Thread Hubert Mazur
ffmpeg | branch: master | Hubert Mazur  | Thu Sep  8 
11:25:04 2022 +0200| [c495a4b32d352e087318d3a09a9bb4f2b55cfa04] | committer: 
Martin Storsjö

lavc/aarch64: Add neon implementation of vsse16

Provide optimized implementation of vsse16 for arm64.

Performance comparison tests are shown below.
- vsse_0_c: 257.7
- vsse_0_neon: 59.2

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c495a4b32d352e087318d3a09a9bb4f2b55cfa04
---

 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S | 87 
 2 files changed, 91 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index ddc5d05611..7b81e48d16 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -43,6 +43,8 @@ int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const 
uint8_t *pix2,
 
 int vsad16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
 ptrdiff_t stride, int h);
+int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
+ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -62,5 +64,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
 c->sse[2] = sse4_neon;
 
 c->vsad[0] = vsad16_neon;
+
+c->vsse[0] = vsse16_neon;
 }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 1d0b166d69..b3f376aa60 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -649,3 +649,90 @@ function vsad16_neon, export=1
 
 ret
 endfunc
+
+function vsse16_neon, export=1
+// x0   unused
+// x1   uint8_t *pix1
+// x2   uint8_t *pix2
+// x3   ptrdiff_t stride
+// w4   int h
+
+ld1 {v0.16b}, [x1], x3  // Load pix1[0], first 
iteration
+ld1 {v1.16b}, [x2], x3  // Load pix2[0], first 
iteration
+
+sub w4, w4, #1  // we need to make h-1 
iterations
+moviv16.4s, #0
+moviv17.4s, #0
+
+cmp w4, #3  // check if we can 
make 3 iterations at once
+usubl   v31.8h, v0.8b, v1.8b// Signed difference 
of pix1[0] - pix2[0], first iteration
+usubl2  v30.8h, v0.16b, v1.16b  // Signed difference 
of pix1[0] - pix2[0], first iteration
+b.le2f
+
+
+1:
+// x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+// res = (x) * (x)
+ld1 {v0.16b}, [x1], x3  // Load pix1[0 + 
stride], first iteration
+ld1 {v1.16b}, [x2], x3  // Load pix2[0 + 
stride], first iteration
+ld1 {v2.16b}, [x1], x3  // Load pix1[0 + 
stride], second iteration
+ld1 {v3.16b}, [x2], x3  // Load pix2[0 + 
stride], second iteration
+usubl   v29.8h, v0.8b, v1.8b
+usubl2  v28.8h, v0.16b, v1.16b
+ld1 {v4.16b}, [x1], x3  // Load pix1[0 + 
stride], third iteration
+ld1 {v5.16b}, [x2], x3  // Load pix2[0 + 
stride], third iteration
+sabdv31.8h, v31.8h, v29.8h
+sabdv30.8h, v30.8h, v28.8h
+usubl   v27.8h, v2.8b, v3.8b
+usubl2  v26.8h, v2.16b, v3.16b
+usubl   v25.8h, v4.8b, v5.8b
+usubl2  v24.8h, v4.16b, v5.16b
+sabdv29.8h, v29.8h, v27.8h
+sabdv27.8h, v27.8h, v25.8h
+umlal   v16.4s, v31.4h, v31.4h
+umlal2  v17.4s, v31.8h, v31.8h
+sabdv28.8h, v28.8h, v26.8h
+sabdv26.8h, v26.8h, v24.8h
+umlal   v16.4s, v30.4h, v30.4h
+umlal2  v17.4s, v30.8h, v30.8h
+mov v31.16b, v25.16b
+umlal   v16.4s, v29.4h, v29.4h
+umlal2  v17.4s, v29.8h, v29.8h
+mov v30.16b, v24.16b
+umlal   v16.4s, v28.4h, v28.4h
+umlal2  v17.4s, v28.8h, v28.8h
+sub w4, w4, #3
+umlal   v16.4s, v27.4h, v27.4h
+umlal2  v17.4s, v27.8h, v27.8h
+cmp w4, #3
+umlal   v16.4s, v26.4h, v26.4h
+umlal2  v17.4s, v26.8h, v26.8h
+
+b.ge1b
+
+cbz w4, 3f
+
+// iterate by one
+2:
+ld1 {v0.16b}, [x1], x3
+ld1 {v1.16b}, [x2], x3
+subs 

[FFmpeg-cvslog] lavc/aarch64: Provide neon implementation of nsse16

2022-09-09 Thread Hubert Mazur
ffmpeg | branch: master | Hubert Mazur  | Thu Sep  8 
11:25:07 2022 +0200| [06b98e396adc467a5164a03d71dd71508a2d8881] | committer: 
Martin Storsjö

lavc/aarch64: Provide neon implementation of nsse16

Add vectorized implementation of nsse16 function.

Performance comparison tests are shown below.
- nsse_0_c: 682.2
- nsse_0_neon: 116.5

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Co-authored-by: Martin Storsjö 
Signed-off-by: Hubert Mazur 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=06b98e396adc467a5164a03d71dd71508a2d8881
---

 libavcodec/aarch64/me_cmp_init_aarch64.c |  15 
 libavcodec/aarch64/me_cmp_neon.S | 122 +++
 2 files changed, 137 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 8c295d5457..ade3e9a4c1 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -49,6 +49,10 @@ int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const 
uint8_t *s2,
 ptrdiff_t stride, int h);
 int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t 
*dummy,
   ptrdiff_t stride, int h);
+int nsse16_neon(int multiplier, const uint8_t *s, const uint8_t *s2,
+ptrdiff_t stride, int h);
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t 
*s2,
+ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -72,5 +76,16 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
 
 c->vsse[0] = vsse16_neon;
 c->vsse[4] = vsse_intra16_neon;
+
+c->nsse[0] = nsse16_neon_wrapper;
 }
 }
+
+int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t 
*s2,
+ptrdiff_t stride, int h)
+{ /* Adapter installed as c->nsse[0]: passes nsse16_neon an integer weight in place of the context pointer. */
+if (c) /* context supplied: take the weight from its codec context */
+return nsse16_neon(c->avctx->nsse_weight, s1, s2, stride, h);
+else /* NULL context: use a fixed weight of 8 */
+return nsse16_neon(8, s1, s2, stride, h);
+}
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index cf2b8da425..f8998749a5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -847,3 +847,125 @@ function vsse_intra16_neon, export=1
 
 ret
 endfunc
+
+function nsse16_neon, export=1
+// x0   multiplier
+// x1   uint8_t *pix1
+// x2   uint8_t *pix2
+// x3   ptrdiff_t stride
+// w4   int h
+
+str x0, [sp, #-0x40]!
+stp x1, x2, [sp, #0x10]
+stp x3, x4, [sp, #0x20]
+str x30, [sp, #0x30]
+bl  X(sse16_neon)
+ldr x30, [sp, #0x30]
+mov w9, w0  // here we 
store score1
+ldr x5, [sp]
+ldp x1, x2, [sp, #0x10]
+ldp x3, x4, [sp, #0x20]
+add sp, sp, #0x40
+
+moviv16.8h, #0
+moviv17.8h, #0
+moviv18.8h, #0
+moviv19.8h, #0
+
+ld1 {v0.16b}, [x1], x3
+subsw4, w4, #1  // we need to 
make h-1 iterations
+ld1 {v2.16b}, [x2], x3
+ext v1.16b, v0.16b, v0.16b, #1  // x1 + 1
+cmp w4, #2
+ext v3.16b, v2.16b, v2.16b, #1  // x2 + 1
+
+b.lt2f
+
+// make 2 iterations at once
+1:
+ld1 {v4.16b}, [x1], x3
+ld1 {v6.16b}, [x2], x3
+ld1 {v20.16b}, [x1], x3
+ext v5.16b, v4.16b, v4.16b, #1  // x1 + stride 
+ 1
+usubl   v31.8h, v0.8b, v4.8b
+usubl2  v30.8h, v0.16b, v4.16b
+ld1 {v22.16b}, [x2], x3
+usubl   v29.8h, v1.8b, v5.8b
+usubl2  v28.8h, v1.16b, v5.16b
+ext v7.16b, v6.16b, v6.16b, #1  // x2 + stride 
+ 1
+sabav16.8h, v31.8h, v29.8h
+ext v21.16b, v20.16b, v20.16b, #1
+sabav17.8h, v30.8h, v28.8h
+usubl   v27.8h, v2.8b, v6.8b
+usubl2  v26.8h, v2.16b, v6.16b
+ext v23.16b, v22.16b, v22.16b, #1
+usubl   v25.8h, v3.8b, v7.8b
+usubl2  v24.8h, v3.16b, v7.16b
+sabav18.8h, v27.8h, v25.8h
+sabav19.8h, v26.8h, v24.8h
+
+usubl   v31.8h, v4.8b, v20.8b
+usubl2  v30.8h, v4.16b, v20.16b
+usubl   v29.8h, v5.8b, v21.8b
+usubl2  v28.8h, v5.16b, v21.16b
+sabav16.8h, v31.8h, v29.8h
+sabav1

[FFmpeg-cvslog] lavc/aarch64: Add neon implementation for vsse_intra16

2022-09-09 Thread Hubert Mazur
ffmpeg | branch: master | Hubert Mazur  | Thu Sep  8 
11:25:06 2022 +0200| [908abe8032d2e56f9b94a7ae387e415de4c29115] | committer: 
Martin Storsjö

lavc/aarch64: Add neon implementation for vsse_intra16

Provide optimized implementation for vsse_intra16 for arm64.

Performance tests are shown below.
- vsse_4_c: 155.2
- vsse_4_neon: 36.2

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=908abe8032d2e56f9b94a7ae387e415de4c29115
---

 libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
 libavcodec/aarch64/me_cmp_neon.S | 63 
 2 files changed, 66 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index af83f7ed1e..8c295d5457 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -47,6 +47,8 @@ int vsad_intra16_neon(MpegEncContext *c, const uint8_t *s, 
const uint8_t *dummy,
   ptrdiff_t stride, int h) ;
 int vsse16_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
 ptrdiff_t stride, int h);
+int vsse_intra16_neon(MpegEncContext *c, const uint8_t *s, const uint8_t 
*dummy,
+  ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -69,5 +71,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
 c->vsad[4] = vsad_intra16_neon;
 
 c->vsse[0] = vsse16_neon;
+c->vsse[4] = vsse_intra16_neon;
 }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index ce198ea227..cf2b8da425 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -784,3 +784,66 @@ function vsad_intra16_neon, export=1
 
 ret
 endfunc
+
+function vsse_intra16_neon, export=1 // score = sum over row pairs of sum((row[y] - row[y+1])^2) across 16 bytes; returned in w0
+// x0   unused (never referenced below)
+// x1   uint8_t *pix1 (post-incremented by stride on every row load)
+// x2   uint8_t *dummy (never referenced below)
+// x3   ptrdiff_t stride
+// w4   int h (reused as the number of row pairs still to process)
+
+ld1 {v0.16b}, [x1], x3 // v0 = first row, the "previous row" of the first difference
+moviv16.4s, #0 // v16 = 32-bit partial sums fed from bytes 0-7, cleared
+moviv17.4s, #0 // v17 = 32-bit partial sums fed from bytes 8-15, cleared
+
+sub w4, w4, #1 // we need to make h-1 iterations
+cmp w4, #3 // sets the flags tested by the b.lt below
+b.lt2f // fewer than 3 row pairs: use the one-at-a-time loop. NOTE(review): h == 1 arrives here with w4 == 0 and loop 2 would count down past zero - presumably callers never pass h < 2; confirm
+
+1: // unrolled loop: 3 row pairs per pass
+// v = abs( pix1[0] - pix1[0 + stride] )
+// score = sum( v * v )
+ld1 {v1.16b}, [x1], x3 // row n+1
+ld1 {v2.16b}, [x1], x3 // row n+2
+uabdv30.16b, v0.16b, v1.16b // per-byte |v0 - v1|
+ld1 {v3.16b}, [x1], x3 // row n+3
+umull   v29.8h, v30.8b, v30.8b // square bytes 0-7 into 16-bit lanes
+umull2  v28.8h, v30.16b, v30.16b // square bytes 8-15 into 16-bit lanes
+uabdv27.16b, v1.16b, v2.16b // per-byte |v1 - v2|
+uadalp  v16.4s, v29.8h // pairwise-add the 16-bit squares and accumulate into 32-bit lanes
+umull   v26.8h, v27.8b, v27.8b
+umull2  v27.8h, v27.16b, v27.16b // overwrites v27; its low half was already consumed by the umull above
+uadalp  v17.4s, v28.8h
+uabdv25.16b, v2.16b, v3.16b // per-byte |v2 - v3|
+uadalp  v16.4s, v26.8h
+umull   v24.8h, v25.8b, v25.8b
+umull2  v25.8h, v25.16b, v25.16b // overwrites v25 after its low half was consumed
+uadalp  v17.4s, v27.8h
+sub w4, w4, #3 // three row pairs consumed in this pass
+uadalp  v16.4s, v24.8h
+cmp w4, #3 // flags for the b.ge below
+uadalp  v17.4s, v25.8h
+mov v0.16b, v3.16b // last loaded row becomes the previous row of the next pass
+
+b.ge1b // at least 3 row pairs left: run another unrolled pass
+cbz w4, 3f // nothing left over: skip the tail loop
+
+// iterate by one
+2:
+ld1 {v1.16b}, [x1], x3 // next row
+subsw4, w4, #1 // one row pair consumed
+uabdv30.16b, v0.16b, v1.16b // per-byte |v0 - v1|
+mov v0.16b, v1.16b // current row becomes the previous row
+umull   v29.8h, v30.8b, v30.8b
+umull2  v30.8h, v30.16b, v30.16b
+uadalp  v16.4s, v29.8h
+uadalp  v17.4s, v30.8h
+cbnzw4, 2b // repeat until the remaining count reaches zero
+
+3:
+add v16.4s, v16.4s, v17.4S // merge the two accumulators
+uaddlv  d17, v16.4s // add the four 32-bit lanes into one 64-bit scalar
+fmovw0, s17 // return the low 32 bits of that sum
+
+ret
+endfunc

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] tests/fate-run: Allow to set input options for encoding pass

2022-09-09 Thread Andreas Rheinhardt
ffmpeg | branch: master | Andreas Rheinhardt  | 
Wed Sep  7 22:36:45 2022 +0200| [a5ab4be081aee22e675d0e78aa9ca0d08f4a5d6f] | 
committer: Andreas Rheinhardt

tests/fate-run: Allow to set input options for encoding pass

This will be useful in the next commit.

Signed-off-by: Andreas Rheinhardt 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a5ab4be081aee22e675d0e78aa9ca0d08f4a5d6f
---

 tests/fate-run.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index 4008bcbc16..61cc59acc0 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -247,12 +247,13 @@ transcode(){
 ffprobe_opts=$6
 additional_input=$7
 final_decode=$8
+enc_opt_in=$9
 test -z "$additional_input" || additional_input="$DEC_OPTS 
$additional_input"
 encfile="${outdir}/${test}.${enc_fmt}"
 test $keep -ge 1 || cleanfiles="$cleanfiles $encfile"
 tsrcfile=$(target_path $srcfile)
 tencfile=$(target_path $encfile)
-ffmpeg -f $src_fmt $DEC_OPTS -i $tsrcfile $additional_input \
+ffmpeg -f $src_fmt $DEC_OPTS $enc_opt_in -i $tsrcfile $additional_input \
$ENC_OPTS $enc_opt $FLAGS -f $enc_fmt -y $tencfile || return
 do_md5sum $encfile
 echo $(wc -c $encfile)

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] fate/matroska: Add test for updating AV1 extradata

2022-09-09 Thread Andreas Rheinhardt
ffmpeg | branch: master | Andreas Rheinhardt  | 
Wed Sep  7 23:57:15 2022 +0200| [91e9a6df33d8b14577fe1ec9623d9d0466fdd7d3] | 
committer: Andreas Rheinhardt

fate/matroska: Add test for updating AV1 extradata

Signed-off-by: Andreas Rheinhardt 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=91e9a6df33d8b14577fe1ec9623d9d0466fdd7d3
---

 tests/fate/matroska.mak  |  6 ++
 tests/ref/fate/webm-av1-extradata-update | 32 
 2 files changed, 38 insertions(+)

diff --git a/tests/fate/matroska.mak b/tests/fate/matroska.mak
index 63e81f121b..39137ad4be 100644
--- a/tests/fate/matroska.mak
+++ b/tests/fate/matroska.mak
@@ -49,6 +49,12 @@ FATE_MATROSKA-$(call ALLYES, FLAC_DECODER FLAC_ENCODER 
FLAC_PARSER \
 fate-matroska-flac-extradata-update: CMD = transcode matroska 
$(TARGET_SAMPLES)/mkv/flac_channel_layouts.mka \
matroska "-map 0 -map 0:0 -c flac 
-frames:a:2 8" "-map 0 -c copy"
 
+# This tests that the Matroska/WebM muxer writes the AV1 CodecPrivate
+# via extradata obtained from packet side data. It also tests that
+# the aspect ratio is only written with pixels as DisplayUnit for WebM.
+FATE_MATROSKA-$(call REMUX, WEBM MATROSKA, IVF_DEMUXER AV1_PARSER 
EXTRACT_EXTRADATA_BSF) += fate-webm-av1-extradata-update
+fate-webm-av1-extradata-update: CMD = transcode ivf 
$(TARGET_SAMPLES)/av1/decode_model.ivf webm "-c copy -bsf extract_extradata 
-sar 3:1" "-c copy" "" "" "-nofind_stream_info" "-nofind_stream_info"
+
 # This test tests demuxing Vorbis and chapters from ogg and muxing it in and
 # demuxing it from Matroska/WebM. It furthermore tests the WebM muxer, in
 # particular its DASH mode. Finally, it tests writing the Cues at the front.
diff --git a/tests/ref/fate/webm-av1-extradata-update 
b/tests/ref/fate/webm-av1-extradata-update
new file mode 100644
index 00..9dd2056e0e
--- /dev/null
+++ b/tests/ref/fate/webm-av1-extradata-update
@@ -0,0 +1,32 @@
+fbf3091fdf05b2856c578e7c948d68c3 
*tests/data/fate/webm-av1-extradata-update.webm
+23048 tests/data/fate/webm-av1-extradata-update.webm
+#extradata 0:   35, 0x527207cd
+#tb 0: 1/1000
+#media_type 0: video
+#codec_id 0: av1
+#dimensions 0: 240x100
+#sar 0: 3/1
+0,  0,  0,0, 8168, 0x1851ab62
+0, 42, 42,0, 7040, 0x967788f9, F=0x0
+0, 83, 83,0,4, 0x01f400e2, F=0x0
+0,125,125,0,   48, 0x49ad107e, F=0x0
+0,167,167,0,4, 0x021c00fa, F=0x0
+0,208,208,0,  279, 0x69728439, F=0x0
+0,250,250,0,4, 0x01c400d2, F=0x0
+0,292,292,0,   63, 0x9bbf1836, F=0x0
+0,333,333,0,4, 0x026c012a, F=0x0
+0,375,375,0, 1065, 0xce2003ac, F=0x0
+0,417,417,0,4, 0x019400c2, F=0x0
+0,458,458,0,   52, 0x7a0112f1, F=0x0
+0,500,500,0,4, 0x021c010a, F=0x0
+0,542,542,0,  689, 0x1e8b49e7, F=0x0
+0,583,583,0,4, 0x01e400f2, F=0x0
+0,625,625,0,  209, 0x124c6790, F=0x0
+0,667,667,0,   42, 0xea690e31, F=0x0
+0,708,708,0, 3521, 0xd76ee284, F=0x0
+0,750,750,0,   63, 0x4572188f, F=0x0
+0,792,792,0,  386, 0xb078c259, F=0x0
+0,833,833,0,  178, 0x1ebb5121, F=0x0
+0,875,875,0,   60, 0x729317f7, F=0x0
+0,917,917,0,   40, 0xad970a66, F=0x0
+0,958,958,0,   61, 0xcc0d1a20, F=0x0

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavu/tx: add the inplace flag to PFA FFTs

2022-09-09 Thread Lynne
ffmpeg | branch: master | Lynne  | Sat Sep 10 02:26:49 2022 
+0200| [645a1f4422ad9c8c954e7c42bef2281cac96ab18] | committer: Lynne

lavu/tx: add the inplace flag to PFA FFTs

They support in-place, because they have to use a temporary buffer.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=645a1f4422ad9c8c954e7c42bef2281cac96ab18
---

 libavutil/tx_template.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 35b61fa477..542c15e480 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -836,7 +836,7 @@ static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_##N##xM_def) 
= {\
 .name   = TX_NAME_STR("fft_pfa_" #N "xM"), 
\
 .function   = TX_NAME(ff_tx_fft_pfa_##N##xM),  
\
 .type   = TX_TYPE(FFT),
\
-.flags  = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
\
+.flags  = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
\
 .factors= { N, TX_FACTOR_ANY },
\
 .min_len= N*2, 
\
 .max_len= TX_LEN_UNLIMITED,
\

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavu/tx: generalize MDCTs

2022-09-09 Thread Lynne
ffmpeg | branch: master | Lynne  | Sat Sep 10 02:28:10 2022 
+0200| [51172223fd1a5b71b46fc0d398f4fdc9ed081b83] | committer: Lynne

lavu/tx: generalize MDCTs

The same code can perform any-length MDCTs with minimal changes.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=51172223fd1a5b71b46fc0d398f4fdc9ed081b83
---

 libavutil/tx_template.c | 75 ++---
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 542c15e480..1d4c4d294b 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -940,12 +940,12 @@ static const FFTXCodelet 
TX_NAME(ff_tx_mdct_naive_inv_def) = {
 .prio   = FF_TX_PRIO_MIN,
 };
 
-static av_cold int TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
-   const FFTXCodelet *cd,
-   uint64_t flags,
-   FFTXCodeletOptions *opts,
-   int len, int inv,
-   const void *scale)
+static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
+const FFTXCodelet *cd,
+uint64_t flags,
+FFTXCodeletOptions *opts,
+int len, int inv,
+const void *scale)
 {
 int ret;
 FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
@@ -955,32 +955,49 @@ static av_cold int 
TX_NAME(ff_tx_mdct_sr_init)(AVTXContext *s,
 
 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
 flags |=  AV_TX_INPLACE;  /* in-place */
-flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
+flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
 
 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
-inv, scale)))
-return ret;
+inv, scale))) {
+flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
+if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 
1,
+inv, scale)))
+return ret;
+}
+
+/* If we need to preshuffle just steal the map from the subcontext */
+if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
+s->map = s->sub[0].map;
+s->sub[0].map = NULL;
+} else {
+s->map = av_malloc((len >> 1)*sizeof(*s->map));
+if (!s->map)
+return AVERROR(ENOMEM);
+
+for (int i = 0; i < len >> 1; i++)
+s->map[i] = i;
+}
 
-if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->sub->map : NULL)))
+if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
 return ret;
 
 /* Saves a multiply in a hot path. */
 if (inv)
 for (int i = 0; i < (s->len >> 1); i++)
-s->sub->map[i] <<= 1;
+s->map[i] <<= 1;
 
 return 0;
 }
 
-static void TX_NAME(ff_tx_mdct_sr_fwd)(AVTXContext *s, void *_dst, void *_src,
-   ptrdiff_t stride)
+static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
+ptrdiff_t stride)
 {
 TXSample *src = _src, *dst = _dst;
 TXComplex *exp = s->exp, tmp, *z = _dst;
 const int len2 = s->len >> 1;
 const int len4 = s->len >> 2;
 const int len3 = len2 * 3;
-const int *sub_map = s->sub->map;
+const int *sub_map = s->map;
 
 stride /= sizeof(*dst);
 
@@ -1011,14 +1028,14 @@ static void TX_NAME(ff_tx_mdct_sr_fwd)(AVTXContext *s, 
void *_dst, void *_src,
 }
 }
 
-static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, void *_dst, void *_src,
-   ptrdiff_t stride)
+static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
+ptrdiff_t stride)
 {
 TXComplex *z = _dst, *exp = s->exp;
 const TXSample *src = _src, *in1, *in2;
 const int len2 = s->len >> 1;
 const int len4 = s->len >> 2;
-const int *sub_map = s->sub->map;
+const int *sub_map = s->map;
 
 stride /= sizeof(*src);
 in1 = src;
@@ -1043,28 +1060,28 @@ static void TX_NAME(ff_tx_mdct_sr_inv)(AVTXContext *s, 
void *_dst, void *_src,
 }
 }
 
-static const FFTXCodelet TX_NAME(ff_tx_mdct_sr_fwd_def) = {
-.name   = TX_NAME_STR("mdct_sr_fwd"),
-.function   = TX_NAME(ff_tx_mdct_sr_fwd),
+static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
+.name   = TX_NAME_STR("mdct_fwd"),
+.function   = TX_NAME(ff_tx_mdct_fwd),
 .type   = TX_TYPE(MDCT),
 .flags  = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
-.factors[0] = 2,
+.factors= { 2, TX_FACTOR_A

[FFmpeg-cvslog] lavu/tx: propagate the codelet flags into the context

2022-09-09 Thread Lynne
ffmpeg | branch: master | Lynne  | Sat Sep 10 02:26:02 2022 
+0200| [8c283e8fe631135a0c36d50f9c8d558f43cfef7b] | committer: Lynne

lavu/tx: propagate the codelet flags into the context

The field is documented as a combination of both.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8c283e8fe631135a0c36d50f9c8d558f43cfef7b
---

 libavutil/tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/tx.c b/libavutil/tx.c
index da8ebddd9a..aeb0d9dada 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -620,7 +620,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType 
type,
 sctx->len= len;
 sctx->inv= inv;
 sctx->type   = type;
-sctx->flags  = flags;
+sctx->flags  = cd->flags | flags;
 sctx->cd_self= cd;
 
 s->fn[s->nb_sub] = cd->function;

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavu/tx: rotate 3 & 15-point exptabs

2022-09-09 Thread Lynne
ffmpeg | branch: master | Lynne  | Sat Sep 10 02:31:43 2022 
+0200| [c92edd969aaf8b12434ff4bd731aa4bc5548fbbf] | committer: Lynne

lavu/tx: rotate 3 & 15-point exptabs

This just inverts their signs. Simplifies SIMD.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c92edd969aaf8b12434ff4bd731aa4bc5548fbbf
---

 libavutil/tx_template.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 1d4c4d294b..0c7ddd26f6 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -109,11 +109,11 @@ static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
 TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12));
 TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12));
 TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI /  6));
-TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI /  6));
+TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI /  6));
 TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI /  5));
-TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
+TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI /  5));
 TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10));
-TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
+TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI /  5));
 }
 
 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
@@ -226,8 +226,8 @@ static av_always_inline void NAME(TXComplex *out, TXComplex 
*in,\
 \
 SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re);   \
 SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im);   \
-CMUL(t[5].re, t[1].re, tab[5], tab[7], t[3].re, t[1].re);   \
-CMUL(t[5].im, t[1].im, tab[5], tab[7], t[3].im, t[1].im);   \
+CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \
+CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im); \
 \
 BF(z0[0].re, z0[3].re, t[0].re, t[1].re);   \
 BF(z0[0].im, z0[3].im, t[0].im, t[1].im);   \

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".