[FFmpeg-devel] (no subject)

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 265 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 270 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_epel_hv4_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h4_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// feeds four consecutive temporary rows to calc_epelh per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per row
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.4h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().4h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.4h}, [x0], x10
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv6_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h6_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// runs calc_epelh/calc_epelh2 on four consecutive temporary rows and stores
+// six 16-bit values (8 + 4 bytes) per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x5, #120                    // x5 is free again: 8 + 120 = 128 bytes per output row
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per temporary row
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8         // halfwords 0-3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.s}[2], [x0], x5         // halfwords 4-5, then skip to the next row
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv8_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h8_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// runs calc_epelh/calc_epelh2 on four consecutive temporary rows and stores
+// eight 16-bit values per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per row
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h}, [x0], x10
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+add w10, w3, #3
+lsl x10, x10, #7
+sub sp, sp, x10 // tmp_array
+stp x5, x30, [sp, #-32]!
+stp x0, x3, [sp, #16]
+add x0, sp, #32
+sub x1, x1, x2
+ad

[FFmpeg-devel] (no subject)

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 223 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_epel_v4_8_neon
+// Single-pass filter over rows: three 4-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores four
+// 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             s16, [x1]
+        ldr             s17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.s}[0], [x1], x2        // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2   // newest row (4 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.4h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v6_8_neon
+// Single-pass filter over rows: three 8-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores six
+// 16-bit results (8 + 4 bytes).
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2 - 8) // the first store already advances x0 by 8
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2          // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2     // newest row (8 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8         // halfwords 0-3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.s}[2], [x0], x10        // halfwords 4-5, then skip to the next row
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v8_8_neon
+// Single-pass filter over rows: three 8-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores
+// eight 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2          // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2     // newest row (8 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v12_8_neon
+// Single-pass filter over rows: three 16-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb/calc_epelb2 on four rows and
+// stores twelve 16-bit results (16 + 8 bytes).
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2         // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2    // newest row (16 bytes)
+        movi            v4.8h, #0                   // clear v4/v5 before calc_epelb/calc_epelb2
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        str             q4, [x0]                    // halfwords 0-7
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        str             d5, [x0, #16]               // halfwords 8-11
+        add             x0, x0, x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v16_8_neon
+// Single-pass filter over rows: three 16-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb/calc_epelb2 on four rows and
+// stores sixteen 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2         // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2    // newest row (16 bytes)
+        movi            v4.8h, #0                   // clear v4/v5 before calc_epelb/calc_epelb2
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h, v5.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ld1

[FFmpeg-devel] (no subject)

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 347 +++---
 2 files changed, 314 insertions(+), 38 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c

index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
 const uint8_t *_src, ptrdiff_t _srcstride,
 int height, intptr_t mx, intptr_t my, int width), _i8mm);
 +NEON8_FNPROTO(qpel_v, (int16_t *dst,
+const uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)

 NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
 NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
 NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
b/libavcodec/aarch64/hevcdsp_qpel_neon.S

index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
 .endif
 .endm
 +.macro calc_all
+        // Eight unrolled invocations of `calc`. Each one receives nine
+        // registers: one leading operand followed by the window v16-v23,
+        // rotated by one position per step.
+        // The surrounding function must define `calc` so that it leaves the
+        // condition flags set, and must provide local labels 1 (loop head)
+        // and 2 (exit): b.eq leaves the loop, the final b.hi repeats it.
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b
+.endm
+
+.macro calc_all2
+        // Eight unrolled invocations of `calc`, each on a single line (a
+        // macro invocation cannot continue onto the next line). Every
+        // invocation receives 18 registers: two leading operands, then a
+        // window over the even registers v16..v30 and a window over the odd
+        // registers v17..v31, both rotated by one position per step.
+        // The surrounding function must define `calc` so that it leaves the
+        // condition flags set, and must provide local labels 1 (loop head)
+        // and 2 (exit): b.eq leaves the loop, the final b.hi repeats it.
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        b.eq            2f
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        b.eq            2f
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
 // void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 +function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+load_qpel_filterb x5, x4
+sub x1, x1, x2, lsl #1
+mov x9, #(MAX_PB_SIZE * 2)
+sub x1, x1, x2
+ldr s16, [x1]
+ldr s17, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr s18, [x1]
+ldr s19, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr s20, [x1]
+ldr s21, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr

[FFmpeg-devel] (no subject)

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 397 ++
 2 files changed, 402 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c

index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
 +NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+const uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h 
,_i8mm);

 NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, 
epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
b/libavcodec/aarch64/hevcdsp_qpel_neon.S

index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, which is asked for (w3 + 7) rows of
+// 128 bytes written into a temporary array carved out of the stack. The
+// second pass feeds eight consecutive temporary rows to calc_qpelh per
+// output row and stores four 16-bit results.
+// Arguments, per the NEON8_FNPROTO(qpel_hv, ...) declaration and AAPCS64:
+// x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = mx, x5 = my,
+// w6 = width.
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7                // x10 = (height + 7) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep my and the return address across the call
+        stp             x0, x3, [sp, #16]           // dst and height are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7                  // first pass produces height + 7 rows
+        sub             x1, x1, x2                  // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        // Row stride, set after the call so that it does not depend on the
+        // callee leaving x7 (caller-saved under AAPCS64) untouched.
+        mov             x7, #128
+        load_qpel_filterh x5, x4
+        ldr             d16, [sp]
+        ldr             d17, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x7                  // seven rows consumed before the loop
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().4h}, [sp], x7      // eighth row; sp advances one row
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        subs            w3, w3, #1                  // flags tested by the b.eq/b.hi in calc_all
+        st1             {v1.4h}, [x0], x7
+.endm
+        // 7 preloaded rows + one row per calc: sp is back at its entry value
+        // once calc has run height times.
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+add w10, w3, #7
+mov x7, #128
+lsl x10, x10, #7
+sub sp, sp, x10 // tmp_array
+stp x5, x30, [sp, #-32]!
+stp x0, x3, [sp, #16]
+add x0, sp, #32
+sub x1, x1, x2, lsl #1
+add x3, x3, #7
+sub x1, x1, x2
+bl  X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+ldp x5, x30, [sp]
+mov x8, #120
+ldp x0, x3, [sp, #16]
+add sp, sp, #32
+load_qpel_filterh x5, x4
+ldr q16, [sp]
+ldr q17, [sp, x7]
+add sp, sp, x7, lsl #1
+ldr q18, [sp]
+ldr q19, [sp, x7]
+add sp, sp, x7, lsl #1
+ldr q20, [sp]
+ldr q21, [sp, x7

[FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 223 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_epel_v4_8_neon
+// Single-pass filter over rows: three 4-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores four
+// 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             s16, [x1]
+        ldr             s17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.s}[0], [x1], x2        // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2   // newest row (4 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.4h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v6_8_neon
+// Single-pass filter over rows: three 8-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores six
+// 16-bit results (8 + 4 bytes).
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2 - 8) // the first store already advances x0 by 8
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2          // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2     // newest row (8 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8         // halfwords 0-3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.s}[2], [x0], x10        // halfwords 4-5, then skip to the next row
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v8_8_neon
+// Single-pass filter over rows: three 8-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb on four rows and stores
+// eight 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2          // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2     // newest row (8 bytes)
+        movi            v4.8h, #0                   // clear v4 before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v12_8_neon
+// Single-pass filter over rows: three 16-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb/calc_epelb2 on four rows and
+// stores twelve 16-bit results (16 + 8 bytes).
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2         // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2    // newest row (16 bytes)
+        movi            v4.8h, #0                   // clear v4/v5 before calc_epelb/calc_epelb2
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        str             q4, [x0]                    // halfwords 0-7
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        str             d5, [x0, #16]               // halfwords 8-11
+        add             x0, x0, x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_v16_8_neon
+// Single-pass filter over rows: three 16-byte rows are preloaded, then each
+// loop step loads one more row, runs calc_epelb/calc_epelb2 on four rows and
+// stores sixteen 16-bit results.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2                  // x1 -= x2: start one row earlier
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step of x0 per output row
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2         // three rows loaded before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2    // newest row (16 bytes)
+        movi            v4.8h, #0                   // clear v4/v5 before calc_epelb/calc_epelb2
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h, v5.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ld1

[FFmpeg-devel] [PATCH 2/4] lavc/aarch64: new optimization for 8-bit hevc_epel_hv

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 265 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 270 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index e541db5430..ebc16da5b6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_epel_hv4_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h4_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// feeds four consecutive temporary rows to calc_epelh per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per row
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.4h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().4h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.4h}, [x0], x10
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv6_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h6_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// runs calc_epelh/calc_epelh2 on four consecutive temporary rows and stores
+// six 16-bit values (8 + 4 bytes) per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x5, #120                    // x5 is free again: 8 + 120 = 128 bytes per output row
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per temporary row
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8         // halfwords 0-3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.s}[2], [x0], x5         // halfwords 4-5, then skip to the next row
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+// ff_hevc_put_hevc_epel_hv8_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_epel_h8_8_neon_i8mm, which is asked for (w3 + 3) rows
+// written into a temporary array carved out of the stack. The second pass
+// runs calc_epelh/calc_epelh2 on four consecutive temporary rows and stores
+// eight 16-bit values per output row.
+// NOTE(review): presumably x0 = dst, x1 = src, x2 = srcstride, w3 = height,
+// x4 = mx, x5 = my -- confirm against the C prototype.
+// NOTE(review): the allocation uses 128 bytes per row (lsl #7) while the
+// second pass steps by MAX_PB_SIZE * 2; assumes these are equal -- confirm.
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+        add             w10, w3, #3
+        lsl             x10, x10, #7                // x10 = (w3 + 3) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep x5 and the return address across the call
+        stp             x0, x3, [sp, #16]           // x0 and x3 are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2                  // x1 -= x2
+        add             w3, w3, #3                  // first pass produces w3 + 3 rows
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step per row
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ld1             {v18.8h}, [sp], x10         // three rows consumed before the loop
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10    // newest row; sp advances one row
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1                  // flags presumably tested by calc_all4 (defined elsewhere)
+        st1             {v4.8h}, [x0], x10
+.endm
+        // NOTE(review): sp only returns to its entry value if calc_all4 runs
+        // calc exactly w3 times (3 preloaded rows + w3 rows) -- confirm.
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+add w10, w3, #3
+lsl x10, x10, #7
+sub sp, sp, x10 // tmp_array
+stp x5, x30, [sp, #-32]!
+stp x0, x3, [sp, #16]
+add x0, sp, #32
+sub x1, x1, x2
+ad

[FFmpeg-devel] [PATCH 3/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_v

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 347 +++---
 2 files changed, 314 insertions(+), 38 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c

index e9a341ecb9..f6b4c31d17 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
 const uint8_t *_src, ptrdiff_t _srcstride,
 int height, intptr_t mx, intptr_t my, int width), _i8mm);
 +NEON8_FNPROTO(qpel_v, (int16_t *dst,
+const uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)

 NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
 NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
 NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
b/libavcodec/aarch64/hevcdsp_qpel_neon.S

index 4132d7a8a9..eff70d70a4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
 .endif
 .endm
 +.macro calc_all
+        // Eight unrolled invocations of `calc`. Each one receives nine
+        // registers: one leading operand followed by the window v16-v23,
+        // rotated by one position per step.
+        // The surrounding function must define `calc` so that it leaves the
+        // condition flags set, and must provide local labels 1 (loop head)
+        // and 2 (exit): b.eq leaves the loop, the final b.hi repeats it.
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b
+.endm
+
+.macro calc_all2
+        // Eight unrolled invocations of `calc`, each on a single line (a
+        // macro invocation cannot continue onto the next line). Every
+        // invocation receives 18 registers: two leading operands, then a
+        // window over the even registers v16..v30 and a window over the odd
+        // registers v17..v31, both rotated by one position per step.
+        // The surrounding function must define `calc` so that it leaves the
+        // condition flags set, and must provide local labels 1 (loop head)
+        // and 2 (exit): b.eq leaves the loop, the final b.hi repeats it.
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        b.eq            2f
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        b.eq            2f
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
 // void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 +function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+load_qpel_filterb x5, x4
+sub x1, x1, x2, lsl #1
+mov x9, #(MAX_PB_SIZE * 2)
+sub x1, x1, x2
+ldr s16, [x1]
+ldr s17, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr s18, [x1]
+ldr s19, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr s20, [x1]
+ldr s21, [x1, x2]
+add x1, x1, x2, lsl #1
+ldr

[FFmpeg-devel] [PATCH 4/4] lavc/aarch64: new optimization for 8-bit hevc_qpel_hv

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S| 397 ++
 2 files changed, 402 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c

index f6b4c31d17..7d889efe68 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
 +NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+const uint8_t *src, ptrdiff_t srcstride,
+int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
 const uint8_t *src, ptrdiff_t srcstride,
 int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext 
*c, const int bit_depth)
 NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h 
,_i8mm);

 NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, 
_i8mm);
 NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, 
epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S 
b/libavcodec/aarch64/hevcdsp_qpel_neon.S

index eff70d70a4..e4475ba920 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3070,6 +3070,403 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, 
export=1

 ret
 endfunc
 +
+// ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm
+// Two-pass filter. The first pass is delegated to
+// ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, which is asked for (w3 + 7) rows of
+// 128 bytes written into a temporary array carved out of the stack. The
+// second pass feeds eight consecutive temporary rows to calc_qpelh per
+// output row and stores four 16-bit results.
+// Arguments, per the NEON8_FNPROTO(qpel_hv, ...) declaration and AAPCS64:
+// x0 = dst, x1 = src, x2 = srcstride, w3 = height, x4 = mx, x5 = my,
+// w6 = width.
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7                // x10 = (height + 7) * 128 bytes
+        sub             sp, sp, x10 // tmp_array
+        stp             x5, x30, [sp, #-32]!        // keep my and the return address across the call
+        stp             x0, x3, [sp, #16]           // dst and height are overwritten for the call
+        add             x0, sp, #32                 // first pass writes into tmp_array
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7                  // first pass produces height + 7 rows
+        sub             x1, x1, x2                  // src -= 3 * srcstride in total
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+        ldp             x5, x30, [sp]
+        ldp             x0, x3, [sp, #16]
+        add             sp, sp, #32                 // sp now addresses row 0 of tmp_array
+        // Row stride, set after the call so that it does not depend on the
+        // callee leaving x7 (caller-saved under AAPCS64) untouched.
+        mov             x7, #128
+        load_qpel_filterh x5, x4
+        ldr             d16, [sp]
+        ldr             d17, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x7]
+        add             sp, sp, x7, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x7                  // seven rows consumed before the loop
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().4h}, [sp], x7      // eighth row; sp advances one row
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        subs            w3, w3, #1                  // flags tested by the b.eq/b.hi in calc_all
+        st1             {v1.4h}, [x0], x7
+.endm
+        // 7 preloaded rows + one row per calc: sp is back at its entry value
+        // once calc has run height times.
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+add w10, w3, #7
+mov x7, #128
+lsl x10, x10, #7
+sub sp, sp, x10 // tmp_array
+stp x5, x30, [sp, #-32]!
+stp x0, x3, [sp, #16]
+add x0, sp, #32
+sub x1, x1, x2, lsl #1
+add x3, x3, #7
+sub x1, x1, x2
+bl  X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+ldp x5, x30, [sp]
+mov x8, #120
+ldp x0, x3, [sp, #16]
+add sp, sp, #32
+load_qpel_filterh x5, x4
+ldr q16, [sp]
+ldr q17, [sp, x7]
+add sp, sp, x7, lsl #1
+ldr q18, [sp]
+ldr q19, [sp, x7]
+add sp, sp, x7, lsl #1
+ldr q20, [sp]
+ldr q21, [sp, x7

Re: [FFmpeg-devel] [PATCH v2 02/13] avcodec: add extended AVCodec color metadata

2023-10-14 Thread Niklas Haas
On Fri, 13 Oct 2023 19:10:33 +0200 Andreas Rheinhardt 
 wrote:
> This design has several drawbacks:
> 1. It adds stuff that will only be set by a tiny minority of AVCodec's
> to all of them.
> 2. It is based around the underlying assumption that the set of
> permissible states (tupels) is a cartesian product of a set of color
> spaces, a set of color ranges etc. This is wrong: E.g. VP9 disallows
> limited-range RGB (it is syntactically impossible to set the color range
> when using RGB color space).
> 3. I don't see how the MJPEG encoder behaviour where the valid formats
> de facto depend upon strictness can be encoded in this way; isn't the
> aim to get rid of the necessity of the workaround in ffmpeg cli?
> 
> 1. and 2. suggests using some form of function that returns a list of
> supported tupels; if said function uses an AVCodecContext* parameter,
> said list can depend upon the state of the AVCodecContext given to it,
> thereby solving 3. to the extent that one can get the supported
> combinations given AVCodecContext options (but I do not see a good way
> to signal which options modify the supported combinations).

There are two other designs I can think of:

1. Enumerate all possible combinations as a list. To avoid combinatorial
   explosion, setting any field to 'UNSPECIFIED' implies no restriction
   on that field. So the default (no list) would be equivalent to a list
   with a single entry containing values of UNSPECIFIED for every field.

2. Provide a single function which merely checks if a given combination
   is supported or not.

#2 would work for the short term but runs into the same risk of
exponential explosion if we need to start finding a common format
between different filters.

So maybe #1 is the correct approach here. It would also simplify
extending the filter API, as we would only need one set of list
managing/merging/compat testing boilerplate for all of the colorspace
metadata.

I will try implementing #1 on a separate branch.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 02/13] avcodec: add extended AVCodec color metadata

2023-10-14 Thread Niklas Haas
On Fri, 13 Oct 2023 19:10:33 +0200 Andreas Rheinhardt 
 wrote:
> 2. It is based around the underlying assumption that the set of
> permissible states (tuples) is a Cartesian product of a set of color
> spaces, a set of color ranges etc. This is wrong: E.g. VP9 disallows
> limited-range RGB (it is syntactically impossible to set the color range
> when using RGB color space).

Well, upon further consideration, I don't think this is enough to break
the cartesian approach, because RGB is always full range by convention.
Note how vf_scale, vf_zscale and vf_libplacebo all force the color range
for RGB inputs to full. So this is not an exception, rather it is the
rule. In other words, for RGB input, the colorspace and color_range
restrictions should simply be ignored, as they conceptually apply to YUV
formats only.

Note also that, thinking a little bit ahead, independent lists would make
AVFilter negotiation *much* easier as we could just re-use
AVFilterFormats for each field without worry - whereas a "list of
tuples" approach requires introducing a new struct to group such
metadata, a new type of AVFilterFormats list + all supporting functions,
and a lot more boilerplate overall.

So we need to think very carefully if there actually are any
sufficiently strong motivating cases to introduce such heavy machinery.

> 3. I don't see how the MJPEG encoder behaviour where the valid formats
> de facto depend upon strictness can be encoded in this way; isn't the
> aim to get rid of the necessity of the workaround in ffmpeg cli?

Note that ffmpeg cli presently initializes the filter graph well before
the AVCodecContext is set up with options, let alone opened. (Presently,
the logic for overriding the pixfmt list directly looks up the "strict"
field in the options dict)

So that limits the design space somewhat for elegant solutions here.
Either we make the "return list of supported formats" callback in
AVCodec simply accept the strict_std_compliance setting directly, or we
extend the static list of colorspaces itself by an extra strictness
field. Probably the former is better than the latter of these two
approaches.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 00/13] YUVJ removal

2023-10-14 Thread Lynne
Oct 14, 2023, 00:22 by vittorio.giov...@gmail.com:

> On Fri, Oct 13, 2023 at 5:14 PM Lynne  wrote:
>
>> Oct 13, 2023, 20:33 by vittorio.giov...@gmail.com:
>>
>> > On Fri, Oct 13, 2023 at 10:27 AM Niklas Haas  wrote:
>> >
>> >> Changes since v1:
>> >>
>> >> - Remove unneeded patch (AVCodecContext.colorspace init)
>> >> - Merge auto-range conversion into auto-scale filter
>> >> - Replace vf_zscale by vf_colorspace in fftools
>> >>
>> >
>> > Why is this? I haven't checked what vf_colorspace supports in a hot
>> second,
>> > but iirc zscale can handle non linear spaces better and hdr conversion
>> > If it's because it's a built in filter, do you think we could first check
>> > for zscale presence and fallback to colorspace?
>> >
>>
>> vf_colorspace != swscale
>>
>
> I am aware, thanks, not sure why that's related here
>
>
>> Relying on external library for basic functionality that we have
>> no control over, which may break its ABI or API at any moment,
>> when we have a built-in one is a no.
>> I wouldn't agree to having it optional in this case either. Users
>> can explicitly request it as a filter and use it, which fits in better
>> with its very explicit programming model too.
>>
>
> except colorspace doesn't implement necessary features and conversions that
> are present in zscale afair
> if it's an automation to facilitate the life of a user it shouldn't come at
> the cost of producing actual good results
>

colorspace doesn't make it impossible to introduce all that is needed.
It's a cleaner codebase that we can extend.
As for HDR, I think anything but what libplacebo does is sufficient.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 02/13] avcodec: add extended AVCodec color metadata

2023-10-14 Thread Niklas Haas
On Sat, 14 Oct 2023 09:31:32 -0400 Leo Izen  wrote:
> On 10/13/23 10:24, Niklas Haas wrote:
> > From: Niklas Haas 
> > 
> > This is motivated primarily by a desire for YUVJ removal, which will
> > require signalling the supported color ranges as part of the codec
> > capabilities. But since we're here anyway, we might as well add all of
> > the metadata, which I foresee seeing more use in the future (e.g.
> > automatic conversion from HDR to SDR when encoding to formats that don't
> > support AVCOL_TRC_SMPTE2084, ...)
> > ---
> > +
> > +/* Extended colorspace support metadata */
> > +const enum AVColorSpace *csps;  ///< array of 
> > supported color spaces, or NULL if unknown, array is terminated by 
> > AVCOL_SPC_UNSPECIFIED
> > +const enum AVColorRange *color_ranges;  ///< array of 
> > supported color ranges, or NULL if unknown, array is terminated by 0
> > +const enum AVChromaLocation *chroma_locs;   ///< array of 
> > supported chroma locations, or NULL if unknown, array is terminated by 0
> > +const enum AVColorPrimaries *primaries; ///< array of 
> > supported color primaries, or NULL if unknown, array is terminated by 0
> > +const enum AVColorTransferCharacteristic *trcs; ///< array of 
> > supported transfer characteristics, or NULL if unknown, array is terminated 
> > by 0
> >   } AVCodec;
> 
> 
> Any particular reason we're using AVCOL_SPC_UNSPECIFIED to terminate 
> csps, but not using AVCOL_PRI_UNSPECIFIED for the primaries and the 
> equivalent for the TRC? It seems a bit more consistent than using RESERVED0

To be clear, we are - AVCOL_PRI/TRC_UNSPECIFIED are equal to 0, unlike
the other fields. But I could change the comments here for clarity.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Kieran Kunhya
On Sat, 14 Oct 2023 at 00:17, Cosmin Stejerean via ffmpeg-devel <
ffmpeg-devel@ffmpeg.org> wrote:

>
>
> > On Oct 13, 2023, at 4:00 PM, Vittorio Giovara <
> vittorio.giov...@gmail.com> wrote:
> >
> > TBF this is in part why i was suggesting a new library - I feel like sws
> is
> > affected by bad branding because of these caching issues and imprecise
> > conversion, and a new clean api in a new library would make a lot of
> sense
> > in my opinion.
>
> I think the branding issue would solve itself in short order if the actual
> implementation of swscale started to be good. My concern with adding a new
> library is that we'd end up in a situation where we have both swscale and a
> new library side by side for some extended period of time.
>
> By comparison adding cleaner APIs to swscale and then slowly strangling
> the old APIs (along the lines of Niklas' proposal) would allow for a more
> gradual transition that has a higher likelihood of success compared to a
> full rewrite IMO.
>

The issue is not the API, the issue is that swscale is astonishingly
complex and difficult to understand internally, there are lots of different
codepaths and randomly you'll end up with a buggy or slow one and have no
idea how to fix it.

It's probably easier to start from scratch than to try and understand and
then fix swscale (years of work).

Regards,
Kieran Kunhya
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 00/13] YUVJ removal

2023-10-14 Thread Vittorio Giovara
On Sat, Oct 14, 2023 at 9:11 AM Lynne  wrote:

> Oct 14, 2023, 00:22 by vittorio.giov...@gmail.com:
>
> > On Fri, Oct 13, 2023 at 5:14 PM Lynne  wrote:
> >
> >> Oct 13, 2023, 20:33 by vittorio.giov...@gmail.com:
> >>
> >> > On Fri, Oct 13, 2023 at 10:27 AM Niklas Haas 
> wrote:
> >> >
> >> >> Changes since v1:
> >> >>
> >> >> - Remove unneeded patch (AVCodecContext.colorspace init)
> >> >> - Merge auto-range conversion into auto-scale filter
> >> >> - Replace vf_zscale by vf_colorspace in fftools
> >> >>
> >> >
> >> > Why is this? I haven't checked what vf_colorspace supports in a hot
> >> second,
> >> > but iirc zscale can handle non linear spaces better and hdr conversion
> >> > If it's because it's a built in filter, do you think we could first
> check
> >> > for zscale presence and fallback to colorspace?
> >> >
> >>
> >> vf_colorspace != swscale
> >>
> >
> > > I am aware, thanks, not sure why that's related here
> >
> >
> >> Relying on external library for basic functionality that we have
> >> no control over, which may break its ABI or API at any moment,
> >> when we have a built-in one is a no.
> >> I wouldn't agree to having it optional in this case either. Users
> >> can explicitly request it as a filter and use it, which fits in better
> >> with its very explicit programming model too.
> >>
> >
> > except colorspace doesn't implement necessary features and conversions
> that
> > are present in zscale afair
> > if it's an automation to facilitate the life of a user it shouldn't come
> at
> > the cost of producing actual good results
> >
>
> colorspace doesn't make it impossible to introduce all that is needed.
> It's a cleaner codebase that we can extend.
>

* that only works on a subset of colorspaces.
Last time I checked, it would have required a massive lift to support
anything with constant luminance or the icpct formats.

As for HDR, I think anything but what libplacebo does is sufficient.
>

Right but it's also important to avoid reinventing the wheel. We could find
better solutions like having a library with different backends and bundle
them as needed.
-- 
Vittorio
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 00/13] YUVJ removal

2023-10-14 Thread Lynne
Oct 14, 2023, 17:16 by vittorio.giov...@gmail.com:

> On Sat, Oct 14, 2023 at 9:11 AM Lynne  wrote:
>
>> Oct 14, 2023, 00:22 by vittorio.giov...@gmail.com:
>>
>> > On Fri, Oct 13, 2023 at 5:14 PM Lynne  wrote:
>> >
>> >> Oct 13, 2023, 20:33 by vittorio.giov...@gmail.com:
>> >>
>> >> > On Fri, Oct 13, 2023 at 10:27 AM Niklas Haas 
>> wrote:
>> >> >
>> >> >> Changes since v1:
>> >> >>
>> >> >> - Remove unneeded patch (AVCodecContext.colorspace init)
>> >> >> - Merge auto-range conversion into auto-scale filter
>> >> >> - Replace vf_zscale by vf_colorspace in fftools
>> >> >>
>> >> >
>> >> > Why is this? I haven't checked what vf_colorspace supports in a hot
>> >> second,
>> >> > but iirc zscale can handle non linear spaces better and hdr conversion
>> >> > If it's because it's a built in filter, do you think we could first
>> check
>> >> > for zscale presence and fallback to colorspace?
>> >> >
>> >>
>> >> vf_colorspace != swscale
>> >>
>> >
>> > I am aware, thanks, not sure why that's related here
>> >
>> >
>> >> Relying on external library for basic functionality that we have
>> >> no control over, which may break its ABI or API at any moment,
>> >> when we have a built-in one is a no.
>> >> I wouldn't agree to having it optional in this case either. Users
>> >> can explicitly request it as a filter and use it, which fits in better
>> >> with its very explicit programming model too.
>> >>
>> >
>> > except colorspace doesn't implement necessary features and conversions
>> that
>> > are present in zscale afair
>> > if it's an automation to facilitate the life of a user it shouldn't come
>> at
>> > the cost of producing actual good results
>> >
>>
>> colorspace doesn't make it impossible to introduce all that is needed.
>> It's a cleaner codebase that we can extend.
>>
>
> * that only works on a subset of colorspaces.
> Last time I checked, it would have required a massive lift to support
> anything with constant luminance or the icpct formats.
>
> As for HDR, I think anything but what libplacebo does is sufficient.
>
>>
>>
>
> Right but it's also important to avoid reinventing the wheel. We could find
> better solutions like having a library with different backends and bundle
> them as needed.
>

But we also need determinism. Tests have to pass, and bitexactness
is often required by users.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Michael Niedermayer
Hi

On Fri, Oct 13, 2023 at 11:16:50PM +0000, Cosmin Stejerean via ffmpeg-devel 
wrote:
> 
> 
> > On Oct 13, 2023, at 4:00 PM, Vittorio Giovara  
> > wrote:
> > 
> > TBF this is in part why i was suggesting a new library - I feel like sws is
> > affected by bad branding because of these caching issues and imprecise
> > conversion, and a new clean api in a new library would make a lot of sense
> > in my opinion.
> 
> I think the branding issue would solve itself in short order if the actual 
> implementation of swscale started to be good. My concern with adding a new 
> library is that we'd end up in a situation where we have both swscale and a 
> new library side by side for some extended period of time. 
> 
> By comparison adding cleaner APIs to swscale and then slowly strangling the 
> old APIs (along the lines of Niklas' proposal) would allow for a more gradual 
> transition that has a higher likelihood of success compared to a full rewrite 
> IMO.

+1

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

What's the most stupid thing your enemy could do? Blow himself up.
What's the most stupid thing you could do? Give up your rights and
freedom because your enemy blew himself up.



signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] libavcodec/dxva2.c: fix dxva2 does not support H264 baseline profile

2023-10-14 Thread Benjamin Cheng via ffmpeg-devel
On Fri Oct 13, 2023 at 11:59 PM EDT, xyz1001 wrote:
> dxva2 fails to init when decoding H.264 with the baseline profile because
> `prof_h264_high` does not contain `AV_PROFILE_H264_BASELINE`, so
> `dxva_check_codec_compatibility` returns an error

prof_h264_high uses either DXVA2_ModeH264_E or DXVA2_ModeH264_F, which
only supports up to H.264 High, and H.264 Baseline has features that are
not in High. You have to use a different DXVA profile for Baseline.

> ---
>  libavcodec/dxva2.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
> index d7bc587562..e6b83f89cc 100644
> --- a/libavcodec/dxva2.c
> +++ b/libavcodec/dxva2.c
> @@ -61,7 +61,8 @@ typedef struct dxva_mode {
>  static const int prof_mpeg2_main[]   = {AV_PROFILE_MPEG2_SIMPLE,
>  AV_PROFILE_MPEG2_MAIN,
>  AV_PROFILE_UNKNOWN};
> -static const int prof_h264_high[]= {AV_PROFILE_H264_CONSTRAINED_BASELINE,
> +static const int prof_h264_high[]= {AV_PROFILE_H264_BASELINE,
> +AV_PROFILE_H264_CONSTRAINED_BASELINE,
>  AV_PROFILE_H264_MAIN,
>  AV_PROFILE_H264_HIGH,
>  AV_PROFILE_UNKNOWN};

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] configure: disable libglslang/libshaderc if the vulkan is disabled

2023-10-14 Thread Lynne
Fixes build failures when the Vulkan headers are too old and libglslang
or libshaderc are enabled.

Patch attached.

>From b47d5d3759f9c3de15846ae0fa184f1fa93f2c9a Mon Sep 17 00:00:00 2001
From: Lynne 
Date: Sat, 14 Oct 2023 18:36:46 +0200
Subject: [PATCH] configure: disable libglslang/libshaderc if the vulkan is
 disabled

Fixes build failures when the Vulkan headers are too old and libglslang
or libshaderc are enabled.
---
 configure | 4 
 1 file changed, 4 insertions(+)

diff --git a/configure b/configure
index fc7263bf59..582525db30 100755
--- a/configure
+++ b/configure
@@ -7155,6 +7155,10 @@ if enabled vulkan; then
 check_cpp_condition vulkan "vulkan/vulkan.h" "defined(VK_VERSION_1_4) || (defined(VK_VERSION_1_3) && VK_HEADER_VERSION >= 255)"
 fi
 
+if disabled vulkan; then
+disable libglslang libshaderc spirv_compiler
+fi
+
 if enabled x86; then
 case $target_os in
 mingw32*|mingw64*|win32|win64|linux|cygwin*)
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Michael Niedermayer
On Sat, Oct 14, 2023 at 03:19:49PM +0100, Kieran Kunhya wrote:
> On Sat, 14 Oct 2023 at 00:17, Cosmin Stejerean via ffmpeg-devel <
> ffmpeg-devel@ffmpeg.org> wrote:
> 
> >
> >
> > > On Oct 13, 2023, at 4:00 PM, Vittorio Giovara <
> > vittorio.giov...@gmail.com> wrote:
> > >
> > > TBF this is in part why i was suggesting a new library - I feel like sws
> > is
> > > > affected by bad branding because of these caching issues and imprecise
> > > conversion, and a new clean api in a new library would make a lot of
> > sense
> > > in my opinion.
> >
> > I think the branding issue would solve itself in short order if the actual
> > implementation of swscale started to be good. My concern with adding a new
> > library is that we'd end up in a situation where we have both swscale and a
> > new library side by side for some extended period of time.
> >
> > By comparison adding cleaner APIs to swscale and then slowly strangling
> > the old APIs (along the lines of Niklas' proposal) would allow for a more
> > gradual transition that has a higher likelihood of success compared to a
> > full rewrite IMO.
> >
> 
> The issue is not the API, the issue is that swscale is astonishingly
> complex and difficult to understand internally, there are lots of different
> codepaths

> and randomly you'll end up with a buggy or slow one

Randomly?
Code in general doesn't randomly give you something very different.

So, why do I complain? Because swscale has real issues and needs
to be improved, and these comments point in the wrong direction.


> and have no
> idea how to fix it.

If you don't know how to fix it yourself, sending me a bug report is
probably a good start.


> 
> It's probably easier to start from scratch than to try and understand and
> then fix swscale (years of work).

Well there are 2 further aspects with that.

The first one is, bluntly put: if you don't understand the old code, then
you probably are not qualified to write better code.
People tend not to successfully improve things they don't understand.

The 2nd issue is that, ATM, I maintain swscale. If I am involved in the new
effort and understand it, either because of that or because it has some
similarity, then I can continue to maintain swscale. If it's totally
different and I was not involved at all, then I obviously will not maintain
it either.
This is something to be especially aware of in case the cleanup/new
code would be done by someone who comes, does it and leaves. You
could end up with nicer code that's then unmaintained.

PS: what's the real issue with sws?
It evolved out of a YUV->RGB converter that was part of a video player.
It grew from that as more and more stuff was added to it.
This is a similar situation to why ffmpeg.c needed cleanup.

thx

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Dictatorship: All citizens are under surveillance, all their steps and
actions recorded, for the politicians to enforce control.
Democracy: All politicians are under surveillance, all their steps and
actions recorded, for the citizens to enforce control.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-14 Thread Michael Niedermayer
On Sat, Oct 14, 2023 at 04:45:39PM +0800, Logan.Lyu wrote:
[...]
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1
>  ret
>  endfunc
>  +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> +load_epel_filterb x5, x4

This is not a valid diff, some whitespaces and newlines here are not as
they should be

thx


[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Rewriting code that is poorly written but fully understood is good.
Rewriting code that one doesn't understand is a sign that one is less smart
than the original author, trying to rewrite it will not make it better.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] libavcodec/dxva2.c: fix dxva2 does not support H264 baseline profile

2023-10-14 Thread Hendrik Leppkes
On Sat, Oct 14, 2023 at 6:02 AM xyz1001  wrote:
>
> dxva2 fails to init when decoding H.264 with the baseline profile because
> `prof_h264_high` does not contain `AV_PROFILE_H264_BASELINE`, so
> `dxva_check_codec_compatibility` returns an error
> ---
>  libavcodec/dxva2.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
> index d7bc587562..e6b83f89cc 100644
> --- a/libavcodec/dxva2.c
> +++ b/libavcodec/dxva2.c
> @@ -61,7 +61,8 @@ typedef struct dxva_mode {
>  static const int prof_mpeg2_main[]   = {AV_PROFILE_MPEG2_SIMPLE,
>  AV_PROFILE_MPEG2_MAIN,
>  AV_PROFILE_UNKNOWN};
> -static const int prof_h264_high[]= {AV_PROFILE_H264_CONSTRAINED_BASELINE,
> +static const int prof_h264_high[]= {AV_PROFILE_H264_BASELINE,
> +AV_PROFILE_H264_CONSTRAINED_BASELINE,
>  AV_PROFILE_H264_MAIN,
>  AV_PROFILE_H264_HIGH,
>  AV_PROFILE_UNKNOWN};

Baseline is not compatible with main/high profile accelerators.
There is only one profile defined by DXVA2 that supports Baseline
completely, and I have never seen a GPU expose it - and we don't
support it.

- Hendrik
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] ffprobe: fix XML rendering, review XML layout

2023-10-14 Thread Stefano Sabatini
Fix rendering of int values within a side data element, which was
broken since commit d2d3a83ad93, where the side data element was
correctly marked as a variable fields element. Logic to render a
string variable was implemented already, but it was not implemented
for the int fields path, which was enabled by that commit.

Also, the code and schema are changed in order to account for multiple
variable-fields elements - such as side data - contained within the
same parent. Previously it was assumed that a single variable-fields
element was contained within the parent, which was the case for tags,
but is not the case for side-data.

Previously data was rendered as:




Now as:

   
   
   
   
   
   
   
   


Now variable-fields elements are rendered with a containing element
containing generic key/values elements, enabling use of strict XML
schema.

Fix trac issue:
https://trac.ffmpeg.org/ticket/10613
---
 Changelog  |  2 ++
 doc/ffprobe.xsd| 29 --
 fftools/ffprobe.c  | 51 +++---
 tests/ref/fate/ffprobe_xml | 26 ---
 4 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/Changelog b/Changelog
index 0c73f66546..259180e190 100644
--- a/Changelog
+++ b/Changelog
@@ -35,6 +35,8 @@ version :
 - CRI USM demuxer
 - ffmpeg CLI '-top' option deprecated in favor of the setfield filter
 - VAAPI AV1 encoder
+- ffprobe XML output schema changed to account for multiple
+  variable-fields elements within the same parent element
 
 
 version 6.0:
diff --git a/doc/ffprobe.xsd b/doc/ffprobe.xsd
index 87ca265d63..6b815a89df 100644
--- a/doc/ffprobe.xsd
+++ b/doc/ffprobe.xsd
@@ -43,9 +43,13 @@
 
 
 
+
+  
+
+
 
   
-
+
 
   
 
@@ -69,14 +73,23 @@
 
 
 
+
 
-
-
+
+
+
+
+
+
+
+
+
+
 
 
 
   
-
+
 
 
   
@@ -209,7 +222,7 @@
 
   
 
-
+
 
   
 
@@ -270,7 +283,7 @@
 
 
   
-
+
 
   
 
@@ -283,7 +296,7 @@
 
 
   
-
+
   
 
   
@@ -325,7 +338,7 @@
 
 
   
-
+
   
 
   
diff --git a/fftools/ffprobe.c b/fftools/ffprobe.c
index 40bb3f46e1..9db266d3fb 100644
--- a/fftools/ffprobe.c
+++ b/fftools/ffprobe.c
@@ -268,8 +268,8 @@ static struct section sections[] = {
 [SECTION_ID_PACKETS_AND_FRAMES] = { SECTION_ID_PACKETS_AND_FRAMES, 
"packets_and_frames", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PACKET, -1} },
 [SECTION_ID_PACKET] = { SECTION_ID_PACKET, "packet", 0, { 
SECTION_ID_PACKET_TAGS, SECTION_ID_PACKET_SIDE_DATA_LIST, -1 } },
 [SECTION_ID_PACKET_TAGS] ={ SECTION_ID_PACKET_TAGS, "tags", 
SECTION_FLAG_HAS_VARIABLE_FIELDS, { -1 }, .element_name = "tag", .unique_name = 
"packet_tags" },
-[SECTION_ID_PACKET_SIDE_DATA_LIST] ={ SECTION_ID_PACKET_SIDE_DATA_LIST, 
"side_data_list", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PACKET_SIDE_DATA, -1 }, 
.element_name = "side_data", .unique_name = "packet_side_data_list" },
-[SECTION_ID_PACKET_SIDE_DATA] = { SECTION_ID_PACKET_SIDE_DATA, 
"side_data", SECTION_FLAG_HAS_VARIABLE_FIELDS|SECTION_FLAG_HAS_TYPE, { -1 }, 
.unique_name = "packet_side_data", .get_type = get_packet_side_data_type },
+[SECTION_ID_PACKET_SIDE_DATA_LIST] ={ SECTION_ID_PACKET_SIDE_DATA_LIST, 
"side_data_list", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PACKET_SIDE_DATA, -1 }, 
.element_name = "side_data_list", .unique_name = "packet_side_data_list" },
+[SECTION_ID_PACKET_SIDE_DATA] = { SECTION_ID_PACKET_SIDE_DATA, 
"side_data", SECTION_FLAG_HAS_VARIABLE_FIELDS|SECTION_FLAG_HAS_TYPE, { -1 }, 
.unique_name = "packet_side_data", .element_name = "side_datum", .get_type = 
get_packet_side_data_type },
 [SECTION_ID_PIXEL_FORMATS] =  { SECTION_ID_PIXEL_FORMATS, 
"pixel_formats", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PIXEL_FORMAT, -1 } },
 [SECTION_ID_PIXEL_FORMAT] =   { SECTION_ID_PIXEL_FORMAT, 
"pixel_format", 0, { SECTION_ID_PIXEL_FORMAT_FLAGS, 
SECTION_ID_PIXEL_FORMAT_COMPONENTS, -1 } },
 [SECTION_ID_PIXEL_FORMAT_FLAGS] = { SECTION_ID_PIXEL_FORMAT_FLAGS, 
"flags", 0, { -1 }, .unique_name = "pixel_format_flags" },
@@ -292,7 +292,7 @@ static struct section sections[] = {
 [SECTION_ID_STREAM_DISPOSITION] = { SECTION_ID_STREAM_DISPOSITION, 
"disposition", 0, { -1 }, .unique_name = "stream_disposition" },
 [SECTION_ID_STREAM_TAGS] ={ SECTION_ID_STREAM_TAGS, "tags", 
SECTION_FLAG_HAS_VARIABLE_FIELDS, { -1 }, .element_name = "tag", .unique_name = 
"stream_tags" },
 [SECTION_ID_STREAM_SIDE_DATA_LIST] ={ SECTION_ID_STREAM_SIDE_DATA_LIST, 
"side_data_list", SECTION_FLAG_IS_ARRAY, { SECTION_ID_STREAM_SIDE_DATA, -1 }, 
.element_name = "side_data", .unique_name = "stream_sid

Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Niklas Haas
On Sat, 14 Oct 2023 19:00:36 +0200 Michael Niedermayer  
wrote:
> Well there are 2 further aspects with that.
> 
> The first one is bluntly put. If you dont understand the old code, then
> you probably are not qualified to write better code.
> People tend not to successfully improve things they dont understand.

I have a deep understanding of colorspaces and the necessary conversion
steps between them, and am also in a good position to integrate
libplacebo as a possible backend.

However, I do not have a good understanding of CPU/SIMD code, nor the
various swscale internals, beyond the cursory investigation I needed for
some recent swscale bugs I encountered. So I'll definitely need some
help along the way to fully understand those swscale internals.

> The 2nd issue is, ATM, i maintain swscale. If iam involved in the new
> effort and understand it either because of that or because it has some
> similarity then i can continue to maintain swscale. If its totally
> different and i was totally not involded then i also will not maintain
> it obviously.

I think it would be possible to join forces to the extent needed to
arrive at a satisfactory result. At the very least, I have very limited
experience working with "irregular" packed formats.

Obviously, my intent is not to blanket discard the swscale internals. It
has years and years of optimized kernels for various platforms just
lying around, wanting to be used. Hence my proposal of redesigning the
high-level logic, rather than the low-level details.

> This is something to be especially aware of in case the cleanup/new
> code would be done by someone who comes, does it and leaves. you
> could end up with nicer code thats then unmaintained.
> 
> PS: whats the real issue with sws ?
> it evolved out of a piece yuv->rgb converter from a video player.
> It evolved from that and stuff was added into it.
> This is a similar situation to why ffmpeg.c needed cleanup

Yes, it amounts to the usual disentangling of various special cases and
branches into one top-down control flow that knows about all of these
special cases and fast/slow paths to begin with.

My goal is to arrive at a place where we have one single code flow that
looks something like:

1. Settle the complete descriptions of the source and destination format/csp
2. Establish a list of operations to get from A to B, taking into
   account user settings
3. Determine and dispatch the best available functions for each operation

With the necessary code separation and/or layers of abstraction in place
to make this design manageable. In particular, steps 1 and 2 should be
expanded to include things like conversion between primaries, conversion
between HDR and SDR, conversion between YUV/RGB, and so on.

In particular, I also want to eventually add the ability to plug "Apply
a 3DLUT" in as a possible operation type for colorspace conversion,
probably by sharing the code that is already written for vf_lut3d.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Anton Khirnov
Quoting Kieran Kunhya (2023-10-14 16:19:49)
> On Sat, 14 Oct 2023 at 00:17, Cosmin Stejerean via ffmpeg-devel <
> ffmpeg-devel@ffmpeg.org> wrote:
> 
> >
> >
> > > On Oct 13, 2023, at 4:00 PM, Vittorio Giovara <
> > vittorio.giov...@gmail.com> wrote:
> > >
> > > TBF this is in part why i was suggesting a new library - I feel like sws
> > is
> > > affected by bad brading because of these caching issues and imprecise
> > > conversion, and a new clean api in a new library would make a lot of
> > sense
> > > in my opinion.
> >
> > I think the branding issue would solve itself in short order if the actual
> > implementation of swscale started to be good. My concern with adding a new
> > library is that we'd end up in a situation where we have both swscale and a
> > new library side by side for some extended period of time.
> >
> > By comparison adding cleaner APIs to swscale and then slowly strangling
> > the old APIs (along the lines of Niklas' proposal) would allow for a more
> > gradual transition that has a higher likelihood of success compared to a
> > full rewrite IMO.
> >
> 
> The issue is not the API, the issue is that swscale is astonishingly
> complex and difficult to understand internally, there are lots of different
> codepaths and randomly you'll end up with a buggy or slow one and have no
> idea how to fix it.
> 
> It's probably easier to start from scratch than to try and understand and
> then fix swscale (years of work).

I've seen more than one attempt to do that over the years, all failed.
While I do agree that sws code is atrociously bad, I think that
* an attempt to reinvent it from scratch is quite likely to fail and
  produce nothing of value
* sws can be incrementally improved

-- 
Anton Khirnov
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Kieran Kunhya
On Sat, 14 Oct 2023, 18:00 Michael Niedermayer, 
wrote:

> On Sat, Oct 14, 2023 at 03:19:49PM +0100, Kieran Kunhya wrote:
> > On Sat, 14 Oct 2023 at 00:17, Cosmin Stejerean via ffmpeg-devel <
> > ffmpeg-devel@ffmpeg.org> wrote:
> >
> > >
> > >
> > > > On Oct 13, 2023, at 4:00 PM, Vittorio Giovara <
> > > vittorio.giov...@gmail.com> wrote:
> > > >
> > > > TBF this is in part why i was suggesting a new library - I feel like
> sws
> > > is
> > > > affected by bad brading because of these caching issues and imprecise
> > > > conversion, and a new clean api in a new library would make a lot of
> > > sense
> > > > in my opinion.
> > >
> > > I think the branding issue would solve itself in short order if the
> actual
> > > implementation of swscale started to be good. My concern with adding a
> new
> > > library is that we'd end up in a situation where we have both swscale
> and a
> > > new library side by side for some extended period of time.
> > >
> > > By comparison adding cleaner APIs to swscale and then slowly strangling
> > > the old APIs (along the lines of Niklas' proposal) would allow for a
> more
> > > gradual transition that has a higher likelihood of success compared to
> a
> > > full rewrite IMO.
> > >
> >
> > The issue is not the API, the issue is that swscale is astonishingly
> > complex and difficult to understand internally, there are lots of
> different
> > codepaths
>
> > and randomly you'll end up with a buggy or slow one
>
> randomly ?
> code in general doesnt give you randomly something very different.
>

Come on, there's no need to be facetious. Change the PIX_FMT to the same
sampling but a different packing and you get a totally different codepath;
sometimes it just decides to use C. Likewise with any of the lightly
documented flags, you can have drastic changes in speed and quality unless
you pre-test all the code paths.


> So, why do i complain? because swscale has real issues and needs
> to be improved. And these comments point in the wrong direction
>
>
> > and have no
> > idea how to fix it.
>
> If you dont know how to fix it yourself, sending me a bug report is
> probably a good start.
>

Covered in here: https://trac.ffmpeg.org/wiki/swscale

Yuv422p10 to yuv420p10 has forced and useless and CPU costly scaling of the
luma channel with 32 bit intermediates last time I looked. All to be
shifted back to the original values.



>
>
> >
> > It's probably easier to start from scratch than to try and understand and
> > then fix swscale (years of work).
>
> Well there are 2 further aspects with that.
>
> The first one is bluntly put. If you dont understand the old code, then
> you probably are not qualified to write better code.
> People tend not to successfully improve things they dont understand.
>

I'm pretty sure you don't need to understand self-modifying code to write a
scaler.

The rest is covered here:
https://trac.ffmpeg.org/wiki/swscale


> The 2nd issue is, ATM, i maintain swscale. If iam involved in the new
> effort and understand it either because of that or because it has some
> similarity then i can continue to maintain swscale. If its totally
> different and i was totally not involded then i also will not maintain
> it obviously.
> This is something to be especially aware of in case the cleanup/new
> code would be done by someone who comes, does it and leaves. you
> could end up with nicer code thats then unmaintained.
>

Nicer code can be understood by more than one person.

Let's be honest you would block any attempt to even start removing cruft in
swscale like mmx, self-modifying code etc.


> PS: whats the real issue with sws ?
> it evolved out of a piece yuv->rgb converter from a video player.
> It evolved from that and stuff was added into it.
> This is a similar situation to why ffmpeg.c needed cleanup
>

Hmmm, building a simple thing for something and then "stuff being added",
that sounds like the arguments a few of us have been making in another
thread.

Regards,
Kieran Kunhya



> thx
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Dictatorship: All citizens are under surveillance, all their steps and
> actions recorded, for the politicians to enforce control.
> Democracy: All politicians are under surveillance, all their steps and
> actions recorded, for the citizens to enforce control.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Stefano Sabatini
On date Friday 2023-10-13 21:19:34 +0200, Michael Niedermayer wrote:
> Hi everyone
> 
> I propose using 15k$ from SPI for funding sws cleanup work.
> this is substantially less than what people belive this needs (see IRC logs 
> from yesterday or so)
> So it really is more a small price for a good deed and not proper payment.
> This of course is only available to competent developers. (exact rules or how 
> thats determined
> would still need to be decided unless its a clear case)
> Also the exact outcome and goal would need to be discussed by the community 
> and whoever
> does the work.
> But some goals would probably be to make sws
> * pleasent to work with
> * similar speed or faster
> * proper multithreading
> * proper full colorspace convertion not ignoring gamma, primaries, ...
> * clean / understandable modular design (maybe everything can be a "Filter" 
> inside sws
>   that get build into a chain)
> 
> Proper payment (50k$ maybe) would be too much in relation to what SPI has ATM 
> (150k$)
> 
> Above all, this is just my oppinion, the actual SPI funding also would need to
> be approved by the community. This can happen after a specific volunteer 
> comes forth
> or before, whichever way the community prefers.

Leaving aside the technical details of the implementation, this
should be feasible within the SPI framework (although this would
involve some paperwork and delays due to that).

It would be useful at this point to define the process to accept the
proposal and potential candidates. We have a technical committee which
might take the lead on that and probably have the last word on it,
since "approved by the community" is a bit vague and there is the risk
that there will never be an approval "from the community" because of
diverging views, or that we get stuck at the design level.

As a start, probably there should be a design doc somewhere, discussed
by the community and finally approved (by the technical committee??)
before we present the request and candidate to SPI. In fact probably
the design doc is the first thing a candidate might need to work on.

Also I'd avoid terms such as "rewrite" or "cleanup" since they have
bad connotations, maybe let's call it "review" or "refinement".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] SWS cleanup / SPI Funding Suggestion

2023-10-14 Thread Vittorio Giovara
On Sat, Oct 14, 2023 at 1:00 PM Michael Niedermayer 
wrote:

>
> PS: whats the real issue with sws ?
> it evolved out of a piece yuv->rgb converter from a video player.
> It evolved from that and stuff was added into it.
> This is a similar situation to why ffmpeg.c needed cleanup
>

I'll give you two real issues:
* It's based on an archaic design that doesn't let people contribute to it
easily, and thus it's not very extensible. New code paths *can* be added,
but it's very difficult and it can easily lead astray, often with
unpredictable conversions and subtly introduced bugs.
* It's prefixed with sw- while the rest of the libraries are prefixed with av-
(and to my understanding there is no real reason behind this) (yes, I'm
aware of lswr).

I think the compromise of having a backend-based API backed by different
libraries is the way forward, no cleanups, no rewrites.
-- 
Vittorio
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ffprobe: fix XML rendering, review XML layout

2023-10-14 Thread Stefano Sabatini
On date Saturday 2023-10-14 19:24:28 +0200, Stefano Sabatini wrote:
> Fix rendering of int values within a side data element, which was
> broken since commit d2d3a83ad93, where the side data element was
> correctly marked as a variable fields element. Logic to render a
> string variable was implemented already, but it was not implemented
> for the int fields path, which was enabled by that commit.
[...]

V2 with schema fixes.

I'd like to include the fixed version with 6.1 (will probably push in
a few days if I see no comments).
From 1a22ba50c19eacdd2e1427cf4feeae4f10781233 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini 
Date: Fri, 13 Oct 2023 02:06:21 +0200
Subject: [PATCH] ffprobe: fix XML rendering, review XML layout

Fix rendering of int values within a side data element, which was
broken since commit d2d3a83ad93, where the side data element was
correctly marked as a variable fields element. Logic to render a
string variable was implemented already, but it was not implemented
for the int fields path, which was enabled by that commit.

Also, code and schema is changed in order to account for multiple
variable-fields elements - such as side data, contained within the
same parent. Previously it was assumed that a single variable-fields
element was contained within the parent, which was the case for tags,
but is not the case for side-data.

Previously data was rendered as:




Now as:

   
   
   
   
   
   
   
   


Now variable-fields elements are rendered with a containing element
containing generic key/values elements, enabling use of strict XML
schema.

Fix trac issue:
https://trac.ffmpeg.org/ticket/10613
---
 Changelog  |  2 ++
 doc/ffprobe.xsd| 31 +---
 fftools/ffprobe.c  | 49 --
 tests/ref/fate/ffprobe_xml | 26 +---
 4 files changed, 78 insertions(+), 30 deletions(-)

diff --git a/Changelog b/Changelog
index 0c73f66546..259180e190 100644
--- a/Changelog
+++ b/Changelog
@@ -35,6 +35,8 @@ version :
 - CRI USM demuxer
 - ffmpeg CLI '-top' option deprecated in favor of the setfield filter
 - VAAPI AV1 encoder
+- ffprobe XML output schema changed to account for multiple
+  variable-fields elements within the same parent element
 
 
 version 6.0:
diff --git a/doc/ffprobe.xsd b/doc/ffprobe.xsd
index 87ca265d63..aa2e870f70 100644
--- a/doc/ffprobe.xsd
+++ b/doc/ffprobe.xsd
@@ -43,9 +43,15 @@
 
 
 
-
+
   
 
+  
+
+
+
+  
+
 
   
 
@@ -69,14 +75,23 @@
 
 
 
+
 
-
-
+
+
+
+
+
+
+
+
+
+
 
 
 
   
-
+
 
 
   
@@ -209,7 +224,7 @@
 
   
 
-
+
 
   
 
@@ -270,7 +285,7 @@
 
 
   
-
+
 
   
 
@@ -283,7 +298,7 @@
 
 
   
-
+
   
 
   
@@ -325,7 +340,7 @@
 
 
   
-
+
   
 
   
diff --git a/fftools/ffprobe.c b/fftools/ffprobe.c
index 40bb3f46e1..e490a9a9b2 100644
--- a/fftools/ffprobe.c
+++ b/fftools/ffprobe.c
@@ -269,7 +269,7 @@ static struct section sections[] = {
 [SECTION_ID_PACKET] = { SECTION_ID_PACKET, "packet", 0, { SECTION_ID_PACKET_TAGS, SECTION_ID_PACKET_SIDE_DATA_LIST, -1 } },
 [SECTION_ID_PACKET_TAGS] ={ SECTION_ID_PACKET_TAGS, "tags", SECTION_FLAG_HAS_VARIABLE_FIELDS, { -1 }, .element_name = "tag", .unique_name = "packet_tags" },
 [SECTION_ID_PACKET_SIDE_DATA_LIST] ={ SECTION_ID_PACKET_SIDE_DATA_LIST, "side_data_list", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PACKET_SIDE_DATA, -1 }, .element_name = "side_data", .unique_name = "packet_side_data_list" },
-[SECTION_ID_PACKET_SIDE_DATA] = { SECTION_ID_PACKET_SIDE_DATA, "side_data", SECTION_FLAG_HAS_VARIABLE_FIELDS|SECTION_FLAG_HAS_TYPE, { -1 }, .unique_name = "packet_side_data", .get_type = get_packet_side_data_type },
+[SECTION_ID_PACKET_SIDE_DATA] = { SECTION_ID_PACKET_SIDE_DATA, "side_data", SECTION_FLAG_HAS_VARIABLE_FIELDS|SECTION_FLAG_HAS_TYPE, { -1 }, .unique_name = "packet_side_data", .element_name = "side_datum", .get_type = get_packet_side_data_type },
 [SECTION_ID_PIXEL_FORMATS] =  { SECTION_ID_PIXEL_FORMATS, "pixel_formats", SECTION_FLAG_IS_ARRAY, { SECTION_ID_PIXEL_FORMAT, -1 } },
 [SECTION_ID_PIXEL_FORMAT] =   { SECTION_ID_PIXEL_FORMAT, "pixel_format", 0, { SECTION_ID_PIXEL_FORMAT_FLAGS, SECTION_ID_PIXEL_FORMAT_COMPONENTS, -1 } },
 [SECTION_ID_PIXEL_FORMAT_FLAGS] = { SECTION_ID_PIXEL_FORMAT_FLAGS, "flags", 0, { -1 }, .unique_name = "pixel_format_flags" },
@@ -292,7 +292,7 @@ static struct section sections[] = {
 [SECTION_ID_STREAM_DISPOSITION] = { SECTION_ID_STREAM_DISPOSITION, "disposition", 0, { -1 }, .unique_name = "stream_disposition" },
 [SECTION_ID_STREAM_TAGS

Re: [FFmpeg-devel] [PATCH 2/7] avcodec/h261dec, vc1dec: Don't set write-only macroblock dimensions

2023-10-14 Thread Andreas Rheinhardt
Michael Niedermayer:
> On Sat, Oct 07, 2023 at 02:40:26AM +0200, Andreas Rheinhardt wrote:
>> They are generally set in ff_mpv_init_context_frame()
>> (mostly called by ff_mpv_common_init()); setting them
>> somewhere else should be avoided.
>>
>> Signed-off-by: Andreas Rheinhardt 
>> ---
>>  libavcodec/h261dec.c | 6 --
>>  libavcodec/vc1dec.c  | 3 ---
>>  2 files changed, 9 deletions(-)
> 
> This seems to break several vc1 files like
> vlcticket/5887/Cruise\ 2012_07_29_19_02_16.wmv
> 
> I think its there:
> https://streams.videolan.org/issues/5887/
> 

Thanks for testing, much appreciated. The reason for this is line 968,
where mb_height is used before ff_mpv_common_init() is called. This
actually points to a potential bug: On frame size changes (can happen in
ff_vc1_decode_entry_point()), this line would still use the old value of
mb_height. But I don't want to deal with this at the moment, so I'll
simply drop the vc1dec.c stuff from this patch and apply only h261dec.c.

- Andreas

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] avformat/mov: The iloc test is not redundant

2023-10-14 Thread Michael Niedermayer
Fixes: Assertion failure
Fixes: 
62866/clusterfuzz-testcase-minimized-ffmpeg_dem_MOV_fuzzer-5282997370486784

Found-by: continuous fuzzing process 
https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
Signed-off-by: Michael Niedermayer 
---
 libavformat/mov.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 2f29487beb..34691d0cda 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -7788,11 +7788,10 @@ static int mov_read_iloc(MOVContext *c, AVIOContext 
*pb, MOVAtom atom)
 return 0;
 }
 
-if (c->avif_info) {
-av_log(c->fc, AV_LOG_INFO, "Duplicate iloc box found\n");
+if (c->avif_info || c->fc->nb_streams) {
+av_log(c->fc, AV_LOG_INFO, "Duplicate or invalid iloc box found\n");
 return 0;
 }
-av_assert0(!c->fc->nb_streams);
 
 version = avio_r8(pb);
 avio_rb24(pb);  // flags.
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2] avcodec/jpegxl_parser: fix OOB read regression

2023-10-14 Thread Leo Izen
In f7ac3512f5b5cb8eb149f37300b43461d8e93af3 the size of the dynamically
allocated buffer was shrunk, but it was made too small for very small
alphabet sizes. This patch restores the size to prevent an OOB read.

Reported-by: Cole Dilorenzo 
Signed-off-by: Leo Izen 
---
 libavcodec/jpegxl_parser.c | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/libavcodec/jpegxl_parser.c b/libavcodec/jpegxl_parser.c
index dde36b0d6e..630fc8a60b 100644
--- a/libavcodec/jpegxl_parser.c
+++ b/libavcodec/jpegxl_parser.c
@@ -683,7 +683,7 @@ static int read_vlc_prefix(GetBitContext *gb, 
JXLEntropyDecoder *dec, JXLSymbolD
 int repeat_count_prev = 0, repeat_count_zero = 0, prev = 8;
 int total_code = 0, len, hskip, num_codes = 0, ret;
 
-VLC level1_vlc;
+VLC level1_vlc = { 0 };
 
 if (dist->alphabet_size == 1) {
 dist->vlc.bits = 0;
@@ -709,8 +709,10 @@ static int read_vlc_prefix(GetBitContext *gb, 
JXLEntropyDecoder *dec, JXLSymbolD
 }
 }
 
-if (total_code != 32 && num_codes >= 2 || num_codes < 1)
-return AVERROR_INVALIDDATA;
+if (total_code != 32 && num_codes >= 2 || num_codes < 1) {
+ret = AVERROR_INVALIDDATA;
+goto end;
+}
 
 for (int i = 1; i < 19; i++)
  level1_codecounts[i] += level1_codecounts[i - 1];
@@ -726,7 +728,7 @@ static int read_vlc_prefix(GetBitContext *gb, 
JXLEntropyDecoder *dec, JXLSymbolD
 if (ret < 0)
 goto end;
 
-buf = av_mallocz(dist->alphabet_size * (2 * sizeof(int8_t) + 
sizeof(int16_t) + sizeof(uint32_t))
+buf = av_mallocz(MAX_PREFIX_ALPHABET_SIZE * (2 * sizeof(int8_t) + 
sizeof(int16_t) + sizeof(uint32_t))
  + sizeof(uint32_t));
 if (!buf) {
 ret = AVERROR(ENOMEM);
@@ -734,21 +736,22 @@ static int read_vlc_prefix(GetBitContext *gb, 
JXLEntropyDecoder *dec, JXLSymbolD
 }
 
 level2_lens = (int8_t *)buf;
-level2_lens_s = (int8_t *)(buf + dist->alphabet_size * sizeof(int8_t));
-level2_syms = (int16_t *)(buf + dist->alphabet_size * (2 * 
sizeof(int8_t)));
-level2_codecounts = (uint32_t *)(buf + dist->alphabet_size * (2 * 
sizeof(int8_t) + sizeof(int16_t)));
+level2_lens_s = (int8_t *)(buf + MAX_PREFIX_ALPHABET_SIZE * 
sizeof(int8_t));
+level2_syms = (int16_t *)(buf + MAX_PREFIX_ALPHABET_SIZE * (2 * 
sizeof(int8_t)));
+level2_codecounts = (uint32_t *)(buf + MAX_PREFIX_ALPHABET_SIZE * (2 * 
sizeof(int8_t) + sizeof(int16_t)));
 
 total_code = 0;
 for (int i = 0; i < dist->alphabet_size; i++) {
 len = get_vlc2(gb, level1_vlc.table, 5, 1);
+if (get_bits_left(gb) < 0) {
+ret = AVERROR_BUFFER_TOO_SMALL;
+goto end;
+}
 if (len == 16) {
 int extra = 3 + get_bits(gb, 2);
 if (repeat_count_prev)
-extra = 4 * (repeat_count_prev - 2) - repeat_count_prev + 
extra;
-if (i + extra > dist->alphabet_size) {
-ret = AVERROR_INVALIDDATA;
-goto end;
-}
+extra += 4 * (repeat_count_prev - 2) - repeat_count_prev;
+extra = FFMIN(extra, dist->alphabet_size - i);
 for (int j = 0; j < extra; j++)
 level2_lens[i + j] = prev;
 total_code += (32768 >> prev) * extra;
@@ -759,7 +762,8 @@ static int read_vlc_prefix(GetBitContext *gb, 
JXLEntropyDecoder *dec, JXLSymbolD
 } else if (len == 17) {
 int extra = 3 + get_bits(gb, 3);
 if (repeat_count_zero > 0)
-extra = 8 * (repeat_count_zero - 2) - repeat_count_zero + 
extra;
+extra += 8 * (repeat_count_zero - 2) - repeat_count_zero;
+extra = FFMIN(extra, dist->alphabet_size - i);
 i += extra - 1;
 repeat_count_prev = 0;
 repeat_count_zero += extra;
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ffprobe: fix XML rendering, review XML layout

2023-10-14 Thread Timo Rothenpieler

On 14.10.2023 19:24, Stefano Sabatini wrote:

Fix rendering of int values within a side data element, which was
broken since commit d2d3a83ad93, where the side data element was
correctly marked as a variable fields element. Logic to render a
string variable was implemented already, but it was not implemented
for the int fields path, which was enabled by that commit.

Also, code and schema is changed in order to account for multiple
variable-fields elements - such as side data, contained within the
same parent. Previously it was assumed that a single variable-fields
element was contained within the parent, which was the case for tags,
but is not the case for side-data.

Previously data was rendered as:

 


Now as:












Isn't a change like that practically an ABI break, and thus would need 
to happen on a major bump?


Alternatively, just leave the old fields as they were; they look like 
they can coexist with the new ones, at least XML-wise.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v3 14/14] vvcdec: add full vvc decoder

2023-10-14 Thread Nuo Mi
On Tue, Sep 5, 2023 at 9:16 PM Nuo Mi  wrote:

>
>>
>> #14 0x55aa01f2 in avformat_find_stream_info (ic=0x58698900,
>> options=0x5869a3c0) at libavformat/demux.c:2771
>> #15 0x55697446 in ifile_open (o=0x7fffdaa0,
>> filename=0x7fffe56b "fate-suite//vvc-conformance/SUBPIC_A_3.bit") at
>> fftools/ffmpeg_demux.c:1538
>> #16 0x556baaf6 in open_files (l=0x58698218,
>> inout=0x5687dc92 "input", open_file=0x55696649 ) at
>> fftools/ffmpeg_opt.c:1284
>> #17 0x556bac9e in ffmpeg_parse_options (argc=22,
>> argv=0x7fffe1c8) at fftools/ffmpeg_opt.c:1324
>> #18 0x556c8c06 in main (argc=22, argv=0x7fffe1c8) at
>> fftools/ffmpeg.c:1314
>>
> thank you for the stack,
> It's introduced by the parameter set parser refactoring.
> fixed by
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9627
> and
> https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9626
>

Hi all,
Do you have any more comments regarding the VVC decoder C code?
If not, I will send a new version, targeting for merge.
Thank you.

>
>
>> [...]
>>
>>
>> --
>> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>>
>> Everything should be made as simple as possible, but not simpler.
>> -- Albert Einstein
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ffprobe: fix XML rendering, review XML layout

2023-10-14 Thread Stefano Sabatini
On date Sunday 2023-10-15 03:09:14 +0200, Timo Rothenpieler wrote:
> On 14.10.2023 19:24, Stefano Sabatini wrote:
> > Fix rendering of int values within a side data element, which was
> > broken since commit d2d3a83ad93, where the side data element was
> > correctly marked as a variable fields element. Logic to render a
> > string variable was implemented already, but it was not implemented
> > for the int fields path, which was enabled by that commit.
> > 
> > Also, code and schema is changed in order to account for multiple
> > variable-fields elements - such as side data, contained within the
> > same parent. Previously it was assumed that a single variable-fields
> > element was contained within the parent, which was the case for tags,
> > but is not the case for side-data.
> > 
> > Previously data was rendered as:
> > 
> >   > min_bitrate="0" avg_bitrate="0" buffer_size="327680" vbv_delay="-1"/>
> > 
> > 
> > Now as:
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> 

> Isn't a change like that practically an ABI break, and thus would need to
> happen on a major bump?

Yes, but in practice we are not tracking changes in the XML format,
and major bumps are more related to ABI changes rather than to
application level functionality, and probably there are not so many
users using the XML format anyway.

> Alternatively, just leave the old fields as they were, they looks like they
> can coexist with the new ones. At least XML wise.

Yes, but note that compliance with the XSD has been broken since the side
data printing addition, so I should fix at least that one, or revert the
change/fix on the compact output (and keep the strict XSD schema broken),
which I'd rather not do.

In my view fixing the side data output can be seen as a fix, so it
should not entail a major bump. OTOH I could keep the old layout for
the tags, but that would imply adding a special rule which I would
like to avoid, and having all the tags grouped together has its own
merits (simplifies some queries).

So at the end I think that breaking the format backward-compatibility
is the least evil, and we should be fine with a minor bump.

Alternatively we might even consider doing a major bump, which seems a
bit overkill and might suggest the idea that we are breaking ABI
compatibility, which is not the case here.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] libavutil/ppc/cpu.c: check that AT_HWCAP2 is defined

2023-10-14 Thread Sean McGovern
It was not introduced until glibc 2.18.
---
This should fix the ppc32 FATE node.
---
 libavutil/ppc/cpu.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libavutil/ppc/cpu.c b/libavutil/ppc/cpu.c
index 96b491c716..bc8bb5f47c 100644
--- a/libavutil/ppc/cpu.c
+++ b/libavutil/ppc/cpu.c
@@ -95,12 +95,15 @@ int ff_get_cpu_flags_ppc(void)
 #endif
 if (ret & AV_CPU_FLAG_VSX)
 av_assert0(ret & AV_CPU_FLAG_ALTIVEC);
-} else if (buf[i] == AT_HWCAP2) {
+}
+#ifdef AT_HWCAP2 /* not introduced until glibc 2.18 */
+else if (buf[i] == AT_HWCAP2) {
 #ifdef PPC_FEATURE2_ARCH_2_07
 if (buf[i + 1] & PPC_FEATURE2_ARCH_2_07)
 ret |= AV_CPU_FLAG_POWER8;
 #endif
 }
+#endif /* AT_HWCAP2 */
 }
 }
 
-- 
2.39.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".