[FFmpeg-cvslog] armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)

2014-07-13 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:14:28 2014 +0100| [42c1cc35b7623ce76c7b55c6bc100f135e17cd4f] | committer: 
Michael Niedermayer

armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)

The previous implementation targeted DTS Coherent Acoustics, which only
requires mdct_bits == 6. This relatively small size lent itself to
unrolling the loops a small number of times, and encoding offsets
calculated at assembly time within the load/store instructions of each
iteration.

In the more general case (codecs such as AAC and AC3) much larger arrays
are used - mdct_bits == [8, 9, 11]. The old method does not scale for
these cases, so more integer registers are used with non-unrolled versions
of the loops (and with some stack spillage). The postrotation filter loop
is still unrolled by a factor of 2 to permit the double-buffering of some
VFP registers to facilitate overlap of neighbouring iterations.

I benchmarked the result by measuring the number of gperftools samples
that hit anywhere in the AAC decoder (starting from aac_decode_frame())
or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same
example AAC stream:

                  Before          After
                  Mean   StdDev   Mean   StdDev  Confidence  Change
aac_decode_frame  2368.1 35.8     2117.2 35.3    100.0%      +11.8%
ff_imdct_half_*   457.5  22.4     251.2  16.2    100.0%      +82.1%

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=42c1cc35b7623ce76c7b55c6bc100f135e17cd4f
---

 libavcodec/arm/mdct_vfp.S |  146 -
 1 file changed, 144 insertions(+), 2 deletions(-)

diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index ee3984c..43f6d14 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -33,6 +33,11 @@ J0  .reqa2
 J1  .reqa4
 J2  .reqip
 J3  .reqlr
+REVTAB_HI .req  v5
+IN_HI   .reqv6
+OUT_HI  .reqv6
+TCOS_HI .reqsl
+TSIN_HI .reqfp
 
 .macro prerotation_innerloop
  .set trig_lo, k
@@ -76,6 +81,43 @@ J3  .reqlr
  .set k, k + 2
 .endm
 
+.macro prerotation_innerloop_rolled
+vldmia  TCOS!, {s16,s17}
+vldmdb  TCOS_HI!, {s18,s19}
+vldrs0, [IN_HI, #-4]
+vldrs1, [IN_HI, #-12]
+vldrs2, [IN, #12]
+vldrs3, [IN, #4]
+vmul.f  s8, s0, s16 @ vector operation
+vldmia  TSIN!, {s20,s21}
+vldmdb  TSIN_HI!, {s22,s23}
+vldrs4, [IN]
+vldrs5, [IN, #8]
+vldrs6, [IN_HI, #-16]
+vldrs7, [IN_HI, #-8]
+vmul.f  s12, s0, s20@ vector operation
+add IN, IN, #16
+sub IN_HI, IN_HI, #16
+ldrhJ0, [REVTAB], #2
+ldrhJ1, [REVTAB], #2
+vmls.f  s8, s4, s20 @ vector operation
+ldrhJ3, [REVTAB_HI, #-2]!
+ldrhJ2, [REVTAB_HI, #-2]!
+add J0, OUT, J0, lsl #3
+vmla.f  s12, s4, s16@ vector operation
+add J1, OUT, J1, lsl #3
+add J2, OUT, J2, lsl #3
+add J3, OUT, J3, lsl #3
+vstrs8, [J0]
+vstrs9, [J1]
+vstrs10, [J2]
+vstrs11, [J3]
+vstrs12, [J0, #4]
+vstrs13, [J1, #4]
+vstrs14, [J2, #4]
+vstrs15, [J3, #4]
+.endm
+
 .macro postrotation_innerloop tail, head
  .set trig_lo_head, n8 - k - 2
  .set trig_hi_head, n8 + k
@@ -142,6 +184,49 @@ J3  .reqlr
  .endif
 .endm
 
+.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, 
tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
+ .ifnc "\tail",""
+vmls.f  s8, s0, \tcos_s0_tail   @ vector operation
+ .endif
+ .ifnc "\head",""
+vldmia  TSIN!, {s16,s17}
+vldmdb  TSIN_HI!, {s18,s19}
+vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
+ .endif
+ .ifnc "\tail",""
+vmla.f  s12, s4, \tcos_s0_tail  @ vector operation
+ .endif
+ .ifnc "\head",""
+vldrs0, [OUT, #+\out_offset_head+0]
+vldrs1, [OUT, #+\out_offset_head+8]
+vldrs2, [OUT_HI, #-\out_offset_head-16]
+vldrs3, [OUT_HI, #-\out_offset_head-8]
+vldrs4, [OUT, #+\out_offset_head+4]
+vldrs5, [OUT, #+\out_offset_head+12]
+vldrs6, [OUT_HI, #-\out_offset_head-12]
+vldrs7, [OUT_HI, #-\out_offset_head-4]
+ .endif
+ .ifnc "\tail",""
+vstrs8, [OUT, #+\out_offset_tail+0]
+vstrs9, [OUT, #+\out_offset_tail+8]
+vstrs10, [OUT_HI, #-\out_offset_tail-16]
+vstrs11, [OUT_HI, #-\out_offset_tail-8]
+ .endif
+ .ifnc "\head",""
+vmul.f  s8, s4, s16 @ vector operation
+ .endi

[FFmpeg-cvslog] armv6: Accelerate butterflies_float

2014-07-16 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:14:31 2014 +0100| [57641410d1a386937bec3fddd6c75119550916ec] | committer: 
Michael Niedermayer

armv6: Accelerate butterflies_float

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the
same sample AAC stream:

                   Before          After
                   Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode       1542.8 43.7     1470.5 41.5    100.0%      +4.9%
butterflies_float  130.0  11.9     70.2   12.1    100.0%      +85.2%

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=57641410d1a386937bec3fddd6c75119550916ec
---

 libavutil/arm/float_dsp_init_vfp.c |4 ++
 libavutil/arm/float_dsp_vfp.S  |  116 
 2 files changed, 120 insertions(+)

diff --git a/libavutil/arm/float_dsp_init_vfp.c 
b/libavutil/arm/float_dsp_init_vfp.c
index 4dfe012..45508b8 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 const float *src1, int len);
 
+void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len);
+
 av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
 {
 if (!have_vfpv3(cpu_flags)) {
@@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, 
int cpu_flags)
 fdsp->vector_fmul_window = ff_vector_fmul_window_vfp;
 }
 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+if (!have_vfpv3(cpu_flags))
+fdsp->butterflies_float = ff_butterflies_float_vfp;
 }
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index 13ff219..7db2452 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1
 vpop{d8-d15}
 bx  lr
 endfunc
+
+/**
+ * ARM VFP implementation of 'butterflies_float_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int 
len)
+function ff_butterflies_float_vfp, export=1
+BASE1   .reqa1
+BASE2   .reqa2
+LEN .reqa3
+OLDFPSCR .req   a4
+
+vpush   {s16-s31}
+fmrxOLDFPSCR, FPSCR
+
+tst LEN, #7
+beq 4f  @ common case: len is a multiple 
of 8
+
+ldr ip, =0x0300 @ RunFast mode, scalar mode
+fmxrFPSCR, ip
+
+tst LEN, #1
+beq 1f
+vldmia  BASE1!, {s0}
+vldmia  BASE2!, {s8}
+vadd.f  s16, s0, s8
+vsub.f  s24, s0, s8
+vstrs16, [BASE1, #0-4*1]
+vstrs24, [BASE2, #0-4*1]
+1:
+tst LEN, #2
+beq 2f
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vadd.f  s16, s0, s8
+vadd.f  s17, s1, s9
+vsub.f  s24, s0, s8
+vsub.f  s25, s1, s9
+vstrd8, [BASE1, #0-8*1]@ s16,s17
+vstrd12, [BASE2, #0-8*1]   @ s24,s25
+2:
+tst LEN, #4
+beq 3f
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vldmia  BASE1!, {s2-s3}
+vldmia  BASE2!, {s10-s11}
+vadd.f  s16, s0, s8
+vadd.f  s17, s1, s9
+vsub.f  s24, s0, s8
+vsub.f  s25, s1, s9
+vadd.f  s18, s2, s10
+vadd.f  s19, s3, s11
+vsub.f  s26, s2, s10
+vsub.f  s27, s3, s11
+vstrd8, [BASE1, #0-16*1]@ s16,s17
+vstrd12, [BASE2, #0-16*1]   @ s24,s25
+vstrd9, [BASE1, #8-16*1]@ s18,s19
+vstrd13, [BASE2, #8-16*1]   @ s26,s27
+3:
+bicsLEN, LEN, #7
+beq 7f
+4:
+ldr ip, =0x0303 @ RunFast mode, short vectors of 
length 4, stride 1
+fmxrFPSCR, ip
+
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vldmia  BASE1!, {s2-s3}
+vldmia  BASE2!, {s10-s11}
+vadd.f  s16, s0, s8
+vldmia  BASE1!, {s4-s5}
+vldmia  BASE2!, {s12-s13}
+vldmia  BASE1!, {s6-s7}
+vldmia  BASE2!, {s14-s15}
+vsub.f  s24, s0, s8
+vadd.f  s20, s4, s12
+subsLEN, LEN, #8
+beq 6f
+5:  vldmia  BASE1!, {s0-s3}
+vldmia  BASE2!, {s8-s11}
+vsub.f  s28, s4, s12
+vstrd8, [BASE1, #0-16*3]@ s16,s17
+vstrd9, [BASE1, #8-16*3]@ s18,s19
+vstrd12, [BASE2, #0-16*3]   @ s24,s25
+vstrd13, [BASE2, #8-16*3]   @ s26,s27
+ 

[FFmpeg-cvslog] armv6: Accelerate vector_fmul_window

2014-07-16 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:14:30 2014 +0100| [649c666137f43542b45941f42034ab3f44a31d38] | committer: 
Michael Niedermayer

armv6: Accelerate vector_fmul_window

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in vector_fmul_window_c() / ff_vector_fmul_window_vfp() for the
same sample AAC stream:

                    Before          After
                    Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode        1598.2 47.4     1529.2 25.4    100.0%      +4.5%
vector_fmul_window  244.0  22.1     188.9  22.3    100.0%      +29.2%

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=649c666137f43542b45941f42034ab3f44a31d38
---

 libavutil/arm/float_dsp_init_vfp.c |7 +-
 libavutil/arm/float_dsp_vfp.S  |  204 
 2 files changed, 210 insertions(+), 1 deletion(-)

diff --git a/libavutil/arm/float_dsp_init_vfp.c 
b/libavutil/arm/float_dsp_init_vfp.c
index 1fe52ab..4dfe012 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -26,12 +26,17 @@
 void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1,
 int len);
 
+void ff_vector_fmul_window_vfp(float *dst, const float *src0,
+   const float *src1, const float *win, int len);
+
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 const float *src1, int len);
 
 av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
 {
-if (!have_vfpv3(cpu_flags))
+if (!have_vfpv3(cpu_flags)) {
 fdsp->vector_fmul = ff_vector_fmul_vfp;
+fdsp->vector_fmul_window = ff_vector_fmul_window_vfp;
+}
 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
 }
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index 8695fbd..13ff219 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -68,6 +68,210 @@ function ff_vector_fmul_vfp, export=1
 endfunc
 
 /**
+ * ARM VFP implementation of 'vector_fmul_window_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
+@const float *src1, const float *win, int len)
+function ff_vector_fmul_window_vfp, export=1
+DST0.reqa1
+SRC0.reqa2
+SRC1.reqa3
+WIN0.reqa4
+LEN .reqv1
+DST1.reqv2
+WIN1.reqv3
+OLDFPSCR .req   ip
+
+push{v1-v3,lr}
+ldr LEN, [sp, #4*4+0]
+vpush   {s16-s31}
+fmrxOLDFPSCR, FPSCR
+add DST1, DST0, LEN, lsl #3
+add SRC1, SRC1, LEN, lsl #2
+add WIN1, WIN0, LEN, lsl #3
+
+tst LEN, #7
+beq 4f  @ common case: len is a multiple 
of 8
+
+ldr lr, =0x0300 @ RunFast mode, scalar mode
+fmxrFPSCR, lr
+
+tst LEN, #1
+beq 1f
+vldmdb  WIN1!, {s0}
+vldmia  SRC0!, {s8}
+vldmia  WIN0!, {s16}
+vmul.f  s24, s0, s8
+vldmdb  SRC1!, {s20}
+vmul.f  s8, s16, s8
+vmls.f  s24, s16, s20
+vmla.f  s8, s0, s20
+vstmia  DST0!, {s24}
+vstmdb  DST1!, {s8}
+1:
+tst LEN, #2
+beq 2f
+vldmdb  WIN1!, {s0}
+vldmdb  WIN1!, {s1}
+vldmia  SRC0!, {s8-s9}
+vldmia  WIN0!, {s16-s17}
+vmul.f  s24, s0, s8
+vmul.f  s25, s1, s9
+vldmdb  SRC1!, {s20}
+vldmdb  SRC1!, {s21}
+vmul.f  s8, s16, s8
+vmul.f  s9, s17, s9
+vmls.f  s24, s16, s20
+vmls.f  s25, s17, s21
+vmla.f  s8, s0, s20
+vmla.f  s9, s1, s21
+vstmia  DST0!, {s24-s25}
+vstmdb  DST1!, {s8}
+vstmdb  DST1!, {s9}
+2:
+tst LEN, #4
+beq 3f
+vldmdb  WIN1!, {s0}
+vldmdb  WIN1!, {s1}
+vldmdb  WIN1!, {s2}
+vldmdb  WIN1!, {s3}
+vldmia  SRC0!, {s8-s11}
+vldmia  WIN0!, {s16-s19}
+vmul.f  s24, s0, s8
+vmul.f  s25, s1, s9
+vmul.f  s26, s2, s10
+vmul.f  s27, s3, s11
+vldmdb  SRC1!, {s20}
+vldmdb  SRC1!, {s21}
+vldmdb  SRC1!, {s22}
+vldmdb  SRC1!, {s23}
+vmul.f  s8, s16, s8
+vmul.f  s9, s17, s9
+vmul.f  s10, s18, s10
+vmul.f  s11, s19, s11
+vmls.f  s24, s16, s20
+vmls.f  s25, s17, s21
+vmls.f  s26, s18, s22
+vmls.f  s27, s19, s23
+vmla.f  s8, s0, s20
+vmla.f  s9, s1, s21
+vmla.f  s10, s2, s22
+vmla.f  s11, s3, s23
+vstmia  DST0!, {s24-s27}
+vstmdb  DST1!, {s8}
+vstmdb  DST1!, {s9}
+vstmdb  DST1!,

[FFmpeg-cvslog] armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)

2014-07-17 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:12:31 2014 +0100| [5c22e8e4ad0852d61d5c4ba8d67d33fd72339497] | committer: 
Martin Storsjö

armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)

The previous implementation targeted DTS Coherent Acoustics, which only
requires mdct_bits == 6. This relatively small size lent itself to
unrolling the loops a small number of times, and encoding offsets
calculated at assembly time within the load/store instructions of each
iteration.

In the more general case (codecs such as AAC and AC3) much larger arrays
are used - mdct_bits == [8, 9, 11]. The old method does not scale for
these cases, so more integer registers are used with non-unrolled versions
of the loops (and with some stack spillage). The postrotation filter loop
is still unrolled by a factor of 2 to permit the double-buffering of some
VFP registers to facilitate overlap of neighbouring iterations.

I benchmarked the result by measuring the number of gperftools samples
that hit anywhere in the AAC decoder (starting from aac_decode_frame())
or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same
example AAC stream:

                  Before          After
                  Mean   StdDev   Mean   StdDev  Confidence  Change
aac_decode_frame  2368.1 35.8     2117.2 35.3    100.0%      +11.8%
ff_imdct_half_*   457.5  22.4     251.2  16.2    100.0%      +82.1%

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5c22e8e4ad0852d61d5c4ba8d67d33fd72339497
---

 libavcodec/arm/mdct_vfp.S |  146 -
 1 file changed, 144 insertions(+), 2 deletions(-)

diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index 94db24f..f3fe668 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -33,6 +33,11 @@ J0  .reqa2
 J1  .reqa4
 J2  .reqip
 J3  .reqlr
+REVTAB_HI .req  v5
+IN_HI   .reqv6
+OUT_HI  .reqv6
+TCOS_HI .reqsl
+TSIN_HI .reqfp
 
 .macro prerotation_innerloop
  .set trig_lo, k
@@ -76,6 +81,43 @@ J3  .reqlr
  .set k, k + 2
 .endm
 
+.macro prerotation_innerloop_rolled
+vldmia  TCOS!, {s16,s17}
+vldmdb  TCOS_HI!, {s18,s19}
+vldrs0, [IN_HI, #-4]
+vldrs1, [IN_HI, #-12]
+vldrs2, [IN, #12]
+vldrs3, [IN, #4]
+vmul.f  s8, s0, s16 @ vector operation
+vldmia  TSIN!, {s20,s21}
+vldmdb  TSIN_HI!, {s22,s23}
+vldrs4, [IN]
+vldrs5, [IN, #8]
+vldrs6, [IN_HI, #-16]
+vldrs7, [IN_HI, #-8]
+vmul.f  s12, s0, s20@ vector operation
+add IN, IN, #16
+sub IN_HI, IN_HI, #16
+ldrhJ0, [REVTAB], #2
+ldrhJ1, [REVTAB], #2
+vmls.f  s8, s4, s20 @ vector operation
+ldrhJ3, [REVTAB_HI, #-2]!
+ldrhJ2, [REVTAB_HI, #-2]!
+add J0, OUT, J0, lsl #3
+vmla.f  s12, s4, s16@ vector operation
+add J1, OUT, J1, lsl #3
+add J2, OUT, J2, lsl #3
+add J3, OUT, J3, lsl #3
+vstrs8, [J0]
+vstrs9, [J1]
+vstrs10, [J2]
+vstrs11, [J3]
+vstrs12, [J0, #4]
+vstrs13, [J1, #4]
+vstrs14, [J2, #4]
+vstrs15, [J3, #4]
+.endm
+
 .macro postrotation_innerloop tail, head
  .set trig_lo_head, n8 - k - 2
  .set trig_hi_head, n8 + k
@@ -142,6 +184,49 @@ J3  .reqlr
  .endif
 .endm
 
+.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, 
tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
+ .ifnc "\tail",""
+vmls.f  s8, s0, \tcos_s0_tail   @ vector operation
+ .endif
+ .ifnc "\head",""
+vldmia  TSIN!, {s16,s17}
+vldmdb  TSIN_HI!, {s18,s19}
+vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
+ .endif
+ .ifnc "\tail",""
+vmla.f  s12, s4, \tcos_s0_tail  @ vector operation
+ .endif
+ .ifnc "\head",""
+vldrs0, [OUT, #+\out_offset_head+0]
+vldrs1, [OUT, #+\out_offset_head+8]
+vldrs2, [OUT_HI, #-\out_offset_head-16]
+vldrs3, [OUT_HI, #-\out_offset_head-8]
+vldrs4, [OUT, #+\out_offset_head+4]
+vldrs5, [OUT, #+\out_offset_head+12]
+vldrs6, [OUT_HI, #-\out_offset_head-12]
+vldrs7, [OUT_HI, #-\out_offset_head-4]
+ .endif
+ .ifnc "\tail",""
+vstrs8, [OUT, #+\out_offset_tail+0]
+vstrs9, [OUT, #+\out_offset_tail+8]
+vstrs10, [OUT_HI, #-\out_offset_tail-16]
+vstrs11, [OUT_HI, #-\out_offset_tail-8]
+ .endif
+ .ifnc "\head",""
+vmul.f  s8, s4, s16 @ vector operation
+ .endif
+

[FFmpeg-cvslog] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

2014-07-17 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Wed Jul 16 
16:02:01 2014 +0100| [87552d54d3337c3241e8a9e1a05df16eaa821496] | committer: 
Martin Storsjö

armv6: Accelerate ff_fft_calc for general case (nbits != 4)

The previous implementation targeted DTS Coherent Acoustics, which only
requires nbits == 4 (fft16()). This case was (and still is) linked directly
rather than being indirected through ff_fft_calc_vfp(), but now the full
range from radix-4 up to radix-65536 is available. This benefits other codecs
such as AAC and AC3.

The implementation is based upon the C version, with each routine larger than
radix-16 calling a hierarchy of smaller FFT functions, then performing a
post-processing pass. This pass benefits a lot from loop unrolling to
counter the long pipelines in the VFP. A relaxed calling standard also
reduces the overhead of the call hierarchy, and avoiding the excessive
inlining performed by GCC probably helps with I-cache utilisation too.

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in the FFT routines (fft4() to fft512() and pass()) for the
same sample AAC stream:

              Before          After
              Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode  2245.5 53.1     1599.6 43.8    100.0%      +40.4%
FFT routines  940.6  22.0     348.1  20.8    100.0%      +170.2%

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=87552d54d3337c3241e8a9e1a05df16eaa821496
---

 libavcodec/arm/fft_init_arm.c |8 +-
 libavcodec/arm/fft_vfp.S  |  264 ++---
 2 files changed, 255 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 3a3d1a7..bc143c1 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@
 #include "libavcodec/rdft.h"
 #include "libavcodec/synth_filter.h"
 
+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
@@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
 {
 int cpu_flags = av_get_cpu_flags();
 
-if (have_vfp(cpu_flags)) {
+if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+s->fft_calc = ff_fft_calc_vfp;
 #if CONFIG_MDCT
-if (!have_vfpv3(cpu_flags))
-s->imdct_half   = ff_imdct_half_vfp;
+s->imdct_half   = ff_imdct_half_vfp;
 #endif
 }
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index 7845ebb..130d529 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,39 @@
 
 #include "libavutil/arm/asm.S"
 
-@ TODO: * FFTs wider than 16
-@   * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit. The a2
+@ register may not be clobbered in these functions, as it holds the
+@ stored original FPSCR.
+
+function ff_fft_calc_vfp, export=1
+ldr ip, [a1, #0]@ nbits
+mov a1, a2
+A   ldr pc, [pc, ip, lsl #2]
+A   .word   0
+A   .word   0
+A   .word   0
+T   movrel  a2, (fft_tab_vfp - 8)
+T   ldr pc, [a2, ip, lsl #2]
+T endfunc
+T const fft_tab_vfp
+.word   fft4_vfp
+.word   fft8_vfp
+.word   X(ff_fft16_vfp) @ this one alone is exported
+.word   fft32_vfp
+.word   fft64_vfp
+.word   fft128_vfp
+.word   fft256_vfp
+.word   fft512_vfp
+.word   fft1024_vfp
+.word   fft2048_vfp
+.word   fft4096_vfp
+.word   fft8192_vfp
+.word   fft16384_vfp
+.word   fft32768_vfp
+.word   fft65536_vfp
+A endfunc
 
 function fft4_vfp
 vldrd0, [a1, #0*2*4]   @ s0,s1   = z[0]
@@ -131,18 +162,22 @@ endfunc
  vstrd9, [a1, #3 * 2*4]
 .endm
 
+function .Lfft8_internal_vfp
+macro_fft8_head
+macro_fft8_tail
+bx  lr
+endfunc
+
 function fft8_vfp
 ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1
 fmrxa2, FPSCR
 fmxrFPSCR, a3
 vpush   {s16-s31}
-
-macro_fft8_head
-macro_fft8_tail
-
+mov ip, lr
+bl  .Lfft8_internal_vfp
 vpop{s16-s31}
 fmxrFPSCR, a2
-bx  lr
+bx  ip
 endfunc
 
 .align 3
@@ -153,12 +188,7 @@ cos1pi8:@ cos(1*pi/8) = sqrt(2+sqrt(2))/2
 cos3pi8:@ cos(3*pi/8) = sqrt(2-sqrt(2))/2
 .float  0.3826834261417388916015625
 
-function ff_fft16_vfp, export=1
-ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1
-fmrxa2, FPSCR
-fmxr  

[FFmpeg-cvslog] armv6: Accelerate vector_fmul_window

2014-07-17 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:12:33 2014 +0100| [5edad2c4a1f46bcc56be755af86ab355c2f1b37f] | committer: 
Martin Storsjö

armv6: Accelerate vector_fmul_window

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in vector_fmul_window_c() / ff_vector_fmul_window_vfp() for the
same sample AAC stream:

                    Before          After
                    Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode        1598.2 47.4     1529.2 25.4    100.0%      +4.5%
vector_fmul_window  244.0  22.1     188.9  22.3    100.0%      +29.2%

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5edad2c4a1f46bcc56be755af86ab355c2f1b37f
---

 libavutil/arm/float_dsp_init_vfp.c |7 +-
 libavutil/arm/float_dsp_vfp.S  |  204 
 2 files changed, 210 insertions(+), 1 deletion(-)

diff --git a/libavutil/arm/float_dsp_init_vfp.c 
b/libavutil/arm/float_dsp_init_vfp.c
index 31cb6ae..f44020e 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -26,12 +26,17 @@
 void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1,
 int len);
 
+void ff_vector_fmul_window_vfp(float *dst, const float *src0,
+   const float *src1, const float *win, int len);
+
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 const float *src1, int len);
 
 av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
 {
-if (!have_vfpv3(cpu_flags))
+if (!have_vfpv3(cpu_flags)) {
 fdsp->vector_fmul = ff_vector_fmul_vfp;
+fdsp->vector_fmul_window = ff_vector_fmul_window_vfp;
+}
 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
 }
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index 8295280..c25588f 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -68,6 +68,210 @@ function ff_vector_fmul_vfp, export=1
 endfunc
 
 /**
+ * ARM VFP implementation of 'vector_fmul_window_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
+@const float *src1, const float *win, int len)
+function ff_vector_fmul_window_vfp, export=1
+DST0.reqa1
+SRC0.reqa2
+SRC1.reqa3
+WIN0.reqa4
+LEN .reqv1
+DST1.reqv2
+WIN1.reqv3
+OLDFPSCR .req   ip
+
+push{v1-v3,lr}
+ldr LEN, [sp, #4*4+0]
+vpush   {s16-s31}
+fmrxOLDFPSCR, FPSCR
+add DST1, DST0, LEN, lsl #3
+add SRC1, SRC1, LEN, lsl #2
+add WIN1, WIN0, LEN, lsl #3
+
+tst LEN, #7
+beq 4f  @ common case: len is a multiple 
of 8
+
+ldr lr, =0x0300 @ RunFast mode, scalar mode
+fmxrFPSCR, lr
+
+tst LEN, #1
+beq 1f
+vldmdb  WIN1!, {s0}
+vldmia  SRC0!, {s8}
+vldmia  WIN0!, {s16}
+vmul.f  s24, s0, s8
+vldmdb  SRC1!, {s20}
+vmul.f  s8, s16, s8
+vmls.f  s24, s16, s20
+vmla.f  s8, s0, s20
+vstmia  DST0!, {s24}
+vstmdb  DST1!, {s8}
+1:
+tst LEN, #2
+beq 2f
+vldmdb  WIN1!, {s0}
+vldmdb  WIN1!, {s1}
+vldmia  SRC0!, {s8-s9}
+vldmia  WIN0!, {s16-s17}
+vmul.f  s24, s0, s8
+vmul.f  s25, s1, s9
+vldmdb  SRC1!, {s20}
+vldmdb  SRC1!, {s21}
+vmul.f  s8, s16, s8
+vmul.f  s9, s17, s9
+vmls.f  s24, s16, s20
+vmls.f  s25, s17, s21
+vmla.f  s8, s0, s20
+vmla.f  s9, s1, s21
+vstmia  DST0!, {s24-s25}
+vstmdb  DST1!, {s8}
+vstmdb  DST1!, {s9}
+2:
+tst LEN, #4
+beq 3f
+vldmdb  WIN1!, {s0}
+vldmdb  WIN1!, {s1}
+vldmdb  WIN1!, {s2}
+vldmdb  WIN1!, {s3}
+vldmia  SRC0!, {s8-s11}
+vldmia  WIN0!, {s16-s19}
+vmul.f  s24, s0, s8
+vmul.f  s25, s1, s9
+vmul.f  s26, s2, s10
+vmul.f  s27, s3, s11
+vldmdb  SRC1!, {s20}
+vldmdb  SRC1!, {s21}
+vldmdb  SRC1!, {s22}
+vldmdb  SRC1!, {s23}
+vmul.f  s8, s16, s8
+vmul.f  s9, s17, s9
+vmul.f  s10, s18, s10
+vmul.f  s11, s19, s11
+vmls.f  s24, s16, s20
+vmls.f  s25, s17, s21
+vmls.f  s26, s18, s22
+vmls.f  s27, s19, s23
+vmla.f  s8, s0, s20
+vmla.f  s9, s1, s21
+vmla.f  s10, s2, s22
+vmla.f  s11, s3, s23
+vstmia  DST0!, {s24-s27}
+vstmdb  DST1!, {s8}
+vstmdb  DST1!, {s9}
+vstmdb  DST1!, {s10}
+ 

[FFmpeg-cvslog] armv6: Accelerate butterflies_float

2014-07-17 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Fri Jul 11 
00:12:34 2014 +0100| [5a272190a04666f0fe41be767396b30712638c21] | committer: 
Martin Storsjö

armv6: Accelerate butterflies_float

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the
same sample AAC stream:

                   Before          After
                   Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode       1542.8 43.7     1470.5 41.5    100.0%      +4.9%
butterflies_float  130.0  11.9     70.2   12.1    100.0%      +85.2%

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5a272190a04666f0fe41be767396b30712638c21
---

 libavutil/arm/float_dsp_init_vfp.c |4 ++
 libavutil/arm/float_dsp_vfp.S  |  116 
 2 files changed, 120 insertions(+)

diff --git a/libavutil/arm/float_dsp_init_vfp.c 
b/libavutil/arm/float_dsp_init_vfp.c
index f44020e..61ff2ed 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 const float *src1, int len);
 
+void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len);
+
 av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
 {
 if (!have_vfpv3(cpu_flags)) {
@@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, 
int cpu_flags)
 fdsp->vector_fmul_window = ff_vector_fmul_window_vfp;
 }
 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+if (!have_vfpv3(cpu_flags))
+fdsp->butterflies_float = ff_butterflies_float_vfp;
 }
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index c25588f..9f920aa 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1
 vpop{d8-d15}
 bx  lr
 endfunc
+
+/**
+ * ARM VFP implementation of 'butterflies_float_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int 
len)
+function ff_butterflies_float_vfp, export=1
+BASE1   .reqa1
+BASE2   .reqa2
+LEN .reqa3
+OLDFPSCR .req   a4
+
+vpush   {s16-s31}
+fmrxOLDFPSCR, FPSCR
+
+tst LEN, #7
+beq 4f  @ common case: len is a multiple 
of 8
+
+ldr ip, =0x0300 @ RunFast mode, scalar mode
+fmxrFPSCR, ip
+
+tst LEN, #1
+beq 1f
+vldmia  BASE1!, {s0}
+vldmia  BASE2!, {s8}
+vadd.f  s16, s0, s8
+vsub.f  s24, s0, s8
+vstrs16, [BASE1, #0-4*1]
+vstrs24, [BASE2, #0-4*1]
+1:
+tst LEN, #2
+beq 2f
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vadd.f  s16, s0, s8
+vadd.f  s17, s1, s9
+vsub.f  s24, s0, s8
+vsub.f  s25, s1, s9
+vstrd8, [BASE1, #0-8*1]@ s16,s17
+vstrd12, [BASE2, #0-8*1]   @ s24,s25
+2:
+tst LEN, #4
+beq 3f
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vldmia  BASE1!, {s2-s3}
+vldmia  BASE2!, {s10-s11}
+vadd.f  s16, s0, s8
+vadd.f  s17, s1, s9
+vsub.f  s24, s0, s8
+vsub.f  s25, s1, s9
+vadd.f  s18, s2, s10
+vadd.f  s19, s3, s11
+vsub.f  s26, s2, s10
+vsub.f  s27, s3, s11
+vstrd8, [BASE1, #0-16*1]@ s16,s17
+vstrd12, [BASE2, #0-16*1]   @ s24,s25
+vstrd9, [BASE1, #8-16*1]@ s18,s19
+vstrd13, [BASE2, #8-16*1]   @ s26,s27
+3:
+bicsLEN, LEN, #7
+beq 7f
+4:
+ldr ip, =0x0303 @ RunFast mode, short vectors of 
length 4, stride 1
+fmxrFPSCR, ip
+
+vldmia  BASE1!, {s0-s1}
+vldmia  BASE2!, {s8-s9}
+vldmia  BASE1!, {s2-s3}
+vldmia  BASE2!, {s10-s11}
+vadd.f  s16, s0, s8
+vldmia  BASE1!, {s4-s5}
+vldmia  BASE2!, {s12-s13}
+vldmia  BASE1!, {s6-s7}
+vldmia  BASE2!, {s14-s15}
+vsub.f  s24, s0, s8
+vadd.f  s20, s4, s12
+subsLEN, LEN, #8
+beq 6f
+5:  vldmia  BASE1!, {s0-s3}
+vldmia  BASE2!, {s8-s11}
+vsub.f  s28, s4, s12
+vstrd8, [BASE1, #0-16*3]@ s16,s17
+vstrd9, [BASE1, #8-16*3]@ s18,s19
+vstrd12, [BASE2, #0-16*3]   @ s24,s25
+vstrd13, [BASE2, #8-16*3]   @ s26,s27
+ 

[FFmpeg-cvslog] arm: Macroize the test for 'setend' CPU instruction support

2014-07-22 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Mon Jul 21 
14:53:06 2014 +0100| [6869612f5c7d4d2f20f69a5658328a761deadb1c] | committer: 
Diego Biurrun

arm: Macroize the test for 'setend' CPU instruction support

Signed-off-by: Diego Biurrun 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6869612f5c7d4d2f20f69a5658328a761deadb1c
---

 libavcodec/arm/h264dsp_init_arm.c |6 +-
 libavutil/arm/cpu.h   |6 ++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/libavcodec/arm/h264dsp_init_arm.c 
b/libavcodec/arm/h264dsp_init_arm.c
index f9712d8..7cb1312 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -104,12 +104,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const 
int bit_depth,
 {
 int cpu_flags = av_get_cpu_flags();
 
-if (have_armv6(cpu_flags) && !(have_vfpv3(cpu_flags) || 
have_neon(cpu_flags))) {
-// This function uses the 'setend' instruction which is deprecated
-// on ARMv8. This instruction is serializing on some ARMv7 cores as
-// well. Therefore, only use the function on ARMv6.
+if (have_setend(cpu_flags))
 c->h264_find_start_code_candidate = 
ff_h264_find_start_code_candidate_armv6;
-}
 if (have_neon(cpu_flags))
 h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavutil/arm/cpu.h b/libavutil/arm/cpu.h
index 52e839c..224409a 100644
--- a/libavutil/arm/cpu.h
+++ b/libavutil/arm/cpu.h
@@ -30,4 +30,10 @@
 #define have_vfpv3(flags)   CPUEXT(flags, VFPV3)
 #define have_neon(flags)CPUEXT(flags, NEON)
 
+/* Some functions use the 'setend' instruction which is deprecated on ARMv8
+ * and serializing on some ARMv7 cores. This macro ensures such functions
+ * are only enabled on ARMv6. */
+#define have_setend(flags)  \
+(have_armv6(flags) && !(have_vfpv3(flags) || have_neon(flags)))
+
 #endif /* AVUTIL_ARM_CPU_H */

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog


[FFmpeg-cvslog] h264: Move start code search functions into separate source files.

2014-08-05 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Mon Jul 21 
16:25:48 2014 +0100| [db7f1c7c5a1d37e7f4da64a79a97bea1c4b6e9f8] | committer: 
Luca Barbato

h264: Move start code search functions into separate source files.

This permits re-use with parsers for codecs which use similar start codes.

Signed-off-by: Luca Barbato 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=db7f1c7c5a1d37e7f4da64a79a97bea1c4b6e9f8
---

 configure  |3 +-
 libavcodec/Makefile|1 +
 libavcodec/arm/Makefile|2 +-
 libavcodec/arm/h264dsp_init_arm.c  |5 +-
 libavcodec/arm/startcode.h |   26 +
 .../arm/{h264dsp_armv6.S => startcode_armv6.S} |4 +-
 libavcodec/h264_parser.c   |2 +-
 libavcodec/h264dsp.c   |   31 +--
 libavcodec/h264dsp.h   |2 +-
 libavcodec/startcode.c |   57 
 libavcodec/startcode.h |   26 +
 11 files changed, 121 insertions(+), 38 deletions(-)

diff --git a/configure b/configure
index b9242e2..4fc1e6a 100755
--- a/configure
+++ b/configure
@@ -1578,6 +1578,7 @@ CONFIG_EXTRA="
 rtpdec
 rtpenc_chain
 sinewin
+startcode
 tpeldsp
 videodsp
 vp3dsp
@@ -1794,7 +1795,7 @@ h263_decoder_select="error_resilience h263_parser h263dsp 
mpeg_er mpegvideo qpel
 h263_encoder_select="aandcttables h263dsp mpegvideoenc"
 h263i_decoder_select="h263_decoder"
 h263p_encoder_select="h263_encoder"
-h264_decoder_select="cabac golomb h264chroma h264dsp h264pred h264qpel 
videodsp"
+h264_decoder_select="cabac golomb h264chroma h264dsp h264pred h264qpel 
startcode videodsp"
 h264_decoder_suggest="error_resilience"
 hevc_decoder_select="bswapdsp cabac golomb videodsp"
 huffyuv_decoder_select="bswapdsp huffyuvdsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index a088a68..7d19e6e 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -79,6 +79,7 @@ OBJS-$(CONFIG_RANGECODER)  += rangecoder.o
 RDFT-OBJS-$(CONFIG_HARDCODED_TABLES)   += sin_tables.o
 OBJS-$(CONFIG_RDFT)+= rdft.o $(RDFT-OBJS-yes)
 OBJS-$(CONFIG_SINEWIN) += sinewin.o
+OBJS-$(CONFIG_STARTCODE)   += startcode.o
 OBJS-$(CONFIG_TPELDSP) += tpeldsp.o
 OBJS-$(CONFIG_VAAPI)   += vaapi.o
 OBJS-$(CONFIG_VDA) += vda.o
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 742c3ee..6c2eb99 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -53,7 +53,6 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP)+= 
arm/videodsp_init_armv5te.o   \
 ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)+= arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_H264DSP)   += arm/h264dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)   += arm/hpeldsp_init_armv6.o  \
   arm/hpeldsp_armv6.o
 ARMV6-OBJS-$(CONFIG_IDCTDSP)   += arm/idctdsp_init_armv6.o  \
@@ -65,6 +64,7 @@ ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)  += 
arm/mpegvideoencdsp_armv6.o
 ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)   += arm/pixblockdsp_armv6.o
 
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)   += arm/mlpdsp_armv6.o
+ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o
 ARMV6-OBJS-$(CONFIG_VP7_DECODER)   += arm/vp8_armv6.o   \
   arm/vp8dsp_init_armv6.o   \
   arm/vp8dsp_armv6.o
diff --git a/libavcodec/arm/h264dsp_init_arm.c 
b/libavcodec/arm/h264dsp_init_arm.c
index 7cb1312..7afd350 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -23,8 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/h264dsp.h"
-
-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+#include "libavcodec/arm/startcode.h"
 
 void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
  int beta, int8_t *tc0);
@@ -105,7 +104,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const 
int bit_depth,
 int cpu_flags = av_get_cpu_flags();
 
 if (have_setend(cpu_flags))
-c->h264_find_start_code_candidate = 
ff_h264_find_start_code_candidate_armv6;
+c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
 if (have_neon(cpu_flags))
 h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
new file mode 100644
index 000..d7996c1
--- /dev/null
+++ b/

[FFmpeg-cvslog] vc-1: Add platform-specific start code search routine to VC1DSPContext.

2014-08-05 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Mon Jul 21 
14:53:08 2014 +0100| [adf8227cf4e7b4fccb2ad88e1e09b6dc00dd00ed] | committer: 
Luca Barbato

vc-1: Add platform-specific start code search routine to VC1DSPContext.

Initialise VC1DSPContext for parser as well as for decoder.
Note, the VC-1 code doesn't actually use the function pointer yet.

Signed-off-by: Luca Barbato 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=adf8227cf4e7b4fccb2ad88e1e09b6dc00dd00ed
---

 configure|4 ++--
 libavcodec/Makefile  |2 +-
 libavcodec/arm/vc1dsp_init_arm.c |3 +++
 libavcodec/vc1.c |2 ++
 libavcodec/vc1dec.c  |1 -
 libavcodec/vc1dsp.c  |3 +++
 libavcodec/vc1dsp.h  |8 
 7 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/configure b/configure
index 4fc1e6a..b2eb0c8 100755
--- a/configure
+++ b/configure
@@ -1885,7 +1885,7 @@ twinvq_decoder_select="mdct lsp sinewin"
 utvideo_decoder_select="bswapdsp"
 utvideo_encoder_select="bswapdsp huffman huffyuvencdsp"
 vble_decoder_select="huffyuvdsp"
-vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel 
intrax8 mpeg_er qpeldsp"
+vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel 
intrax8 mpeg_er qpeldsp startcode"
 vc1image_decoder_select="vc1_decoder"
 vorbis_decoder_select="mdct"
 vorbis_encoder_select="mdct"
@@ -1963,7 +1963,7 @@ wmv3_vdpau_hwaccel_select="vc1_vdpau_hwaccel"
 h264_parser_select="h264_decoder"
 mpegvideo_parser_select="mpegvideo"
 mpeg4video_parser_select="error_resilience h263dsp mpeg_er mpegvideo qpeldsp"
-vc1_parser_select="mpegvideo"
+vc1_parser_select="mpegvideo startcode"
 
 # external libraries
 libfaac_encoder_deps="libfaac"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 7d19e6e..d59bd1c 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -674,7 +674,7 @@ OBJS-$(CONFIG_PNM_PARSER)  += pnm_parser.o pnm.o
 OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o
 OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o
 OBJS-$(CONFIG_TAK_PARSER)  += tak_parser.o tak.o
-OBJS-$(CONFIG_VC1_PARSER)  += vc1_parser.o vc1.o vc1data.o \
+OBJS-$(CONFIG_VC1_PARSER)  += vc1_parser.o vc1.o vc1data.o 
vc1dsp.o \
   msmpeg4.o msmpeg4data.o mpeg4video.o 
\
   h263.o
 OBJS-$(CONFIG_VORBIS_PARSER)   += vorbis_parser.o xiph.o
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index 6d4eb79..a6a97c8 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -20,6 +20,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/arm/cpu.h"
+#include "libavcodec/arm/startcode.h"
 #include "libavcodec/vc1dsp.h"
 #include "vc1dsp.h"
 
@@ -27,6 +28,8 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
 {
 int cpu_flags = av_get_cpu_flags();
 
+if (have_setend(cpu_flags))
+dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
 if (have_neon(cpu_flags))
 ff_vc1dsp_init_neon(dsp);
 }
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 1978b08..cef0fe6 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -1688,5 +1688,7 @@ av_cold int ff_vc1_init_common(VC1Context *v)
 v->pq  = -1;
 v->mvrange = 0; /* 7.1.1.18, p80 */
 
+ff_vc1dsp_init(&v->vc1dsp);
+
 return 0;
 }
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index c83bb4f..f7f6a9f 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -5629,7 +5629,6 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 ff_blockdsp_init(&s->bdsp, avctx);
 ff_h264chroma_init(&v->h264chroma, 8);
 ff_qpeldsp_init(&s->qdsp);
-ff_vc1dsp_init(&v->vc1dsp);
 
 if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == 
AV_CODEC_ID_WMV3IMAGE) {
 int count = 0;
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 3b92eb2..a193dd7 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -29,6 +29,7 @@
 #include "h264chroma.h"
 #include "qpeldsp.h"
 #include "vc1dsp.h"
+#include "startcode.h"
 
 /* Apply overlap transform to horizontal edge */
 static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -948,6 +949,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 
+dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+
 if (ARCH_AARCH64)
 ff_vc1dsp

[FFmpeg-cvslog] vc-1: Optimise parser (with special attention to ARM)

2014-08-05 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Mon Jul 21 
14:53:09 2014 +0100| [701e8b42e12ad625c64ceae2252acb1de390278c] | committer: 
Luca Barbato

vc-1: Optimise parser (with special attention to ARM)

The previous implementation of the parser made four passes over each input
buffer (reduced to two if the container format already guaranteed the input
buffer corresponded to frames, such as with MKV). But these buffers are
often 200K in size, certainly enough to flush the data out of L1 cache, and
for many CPUs, all the way out to main memory. The passes were:

1) locate frame boundaries (not needed for MKV etc)
2) copy the data into a contiguous block (not needed for MKV etc)
3) locate the start codes within each frame
4) unescape the data between start codes

After this, the unescaped data was parsed to extract certain header fields,
but because the unescape operation was so large, this was usually also
effectively operating on uncached memory. Most of the unescaped data was
simply thrown away and never processed further. Only step 2 - because it
used memcpy - was using prefetch, making things even worse.

This patch reorganises these steps so that, aside from the copying, the
operations are performed in parallel, maximising cache utilisation. No more
than the worst-case number of bytes needed for header parsing is unescaped.
Most of the data is, in practice, only read in order to search for a start
code, for which optimised implementations already existed in the H264 codec
(notably the ARM version uses prefetch, so we end up doing both remaining
passes at maximum speed). For MKV files, we know when we've found the last
start code of interest in a given frame, so we are able to avoid doing even
that one remaining pass for most of the buffer.

In some use-cases (such as the Raspberry Pi) video decode is handled by the
GPU, but the entire elementary stream is still fed through the parser to
pick out certain elements of the header which are necessary to manage the
decode process. As you might expect, in these cases, the performance of the
parser is significant.

To measure parser performance, I used the same VC-1 elementary stream in
either an MPEG-2 transport stream or a MKV file, and fed it through avconv
with -c:v copy -c:a copy -f null. These are the gperftools counts for
those streams, both filtered to only include vc1_parse() and its callees,
and unfiltered (to include the whole binary). Lower numbers are better:

                Before          After
File  Filtered  Mean   StdDev   Mean   StdDev  Confidence  Change
M2TS  No        861.7  8.2      650.5  8.1     100.0%      +32.5%
MKV   No        868.9  7.4      731.7  9.0     100.0%      +18.8%
M2TS  Yes       250.0  11.2     27.2   3.4     100.0%      +817.9%
MKV   Yes       149.0  12.8     1.7    0.8     100.0%      +8526.3%

Yes, that last case shows vc1_parse() running 86 times faster! The M2TS
case does show a larger absolute improvement though, since it was worse
to begin with.

This patch has been tested with the FATE suite (albeit on x86 for speed).

Signed-off-by: Luca Barbato 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=701e8b42e12ad625c64ceae2252acb1de390278c
---

 libavcodec/vc1_parser.c |  276 ++-
 1 file changed, 175 insertions(+), 101 deletions(-)

diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index 1bedd98..43ca0ed 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -30,117 +30,84 @@
 #include "vc1.h"
 #include "get_bits.h"
 
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header whose values we pay any attention to */
+#define UNESCAPED_THRESHOLD 37
+
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header which must be valid memory (because they are
+ *  used to update the bitstream cache in skip_bits() calls)
+ */
+#define UNESCAPED_LIMIT 144
+
+typedef enum {
+NO_MATCH,
+ONE_ZERO,
+TWO_ZEROS,
+ONE
+} VC1ParseSearchState;
+
 typedef struct {
 ParseContext pc;
 VC1Context v;
+uint8_t prev_start_code;
+size_t bytes_to_skip;
+uint8_t unesc_buffer[UNESCAPED_LIMIT];
+size_t unesc_index;
+VC1ParseSearchState search_state;
 } VC1ParseContext;
 
-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx,
-const uint8_t *buf, int buf_size)
+static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
+   const uint8_t *buf, int buf_size)
 {
+/* Parse the header we just finished unescaping */
 VC1ParseContext *vpc = s->priv_data;
 GetBitContext gb;
-const uint8_t *start, *end, *next;
-uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
-
 vpc->v.s.avctx = avctx;
 vpc->v.parse_only = 1;
-next = buf;
-s->repeat_pict = 0;
-
-for(start = buf, end = buf + buf_size;

[FFmpeg-cvslog] checkasm: Add vc1dsp in-loop deblocking filter tests

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:42 2022 +0100| [20cb43ea8ba0471dcba442b8de8fa17ff41f6281] | committer: 
Martin Storsjö

checkasm: Add vc1dsp in-loop deblocking filter tests

Note that the benchmarking results for these functions are highly dependent
upon the input data. Therefore, each function is benchmarked twice,
corresponding to the best and worst case complexity of the reference C
implementation. The performance of a real stream decode will fall somewhere
between these two extremes.

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=20cb43ea8ba0471dcba442b8de8fa17ff41f6281
---

 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 ++
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vc1dsp.c   | 102 ++
 tests/fate/checkasm.mak   |   1 +
 5 files changed, 108 insertions(+)

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..7133a6ee66 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -11,6 +11,7 @@ AVCODECOBJS-$(CONFIG_H264PRED)  += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)  += h264qpel.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)  += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)   += llviddspenc.o
+AVCODECOBJS-$(CONFIG_VC1DSP)+= vc1dsp.o
 AVCODECOBJS-$(CONFIG_VP8DSP)+= vp8dsp.o
 AVCODECOBJS-$(CONFIG_VIDEODSP)  += videodsp.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 748d6a9f3a..c2efd81b6d 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -147,6 +147,9 @@ static const struct {
 #if CONFIG_V210_ENCODER
 { "v210enc", checkasm_check_v210enc },
 #endif
+#if CONFIG_VC1DSP
+{ "vc1dsp", checkasm_check_vc1dsp },
+#endif
 #if CONFIG_VP8DSP
 { "vp8dsp", checkasm_check_vp8dsp },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index c3192d8c23..52ab18a5b1 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -78,6 +78,7 @@ void checkasm_check_sw_scale(void);
 void checkasm_check_utvideodsp(void);
 void checkasm_check_v210dec(void);
 void checkasm_check_v210enc(void);
+void checkasm_check_vc1dsp(void);
 void checkasm_check_vf_eq(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
new file mode 100644
index 00..2fd6c74d6c
--- /dev/null
+++ b/tests/checkasm/vc1dsp.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Ben Avison
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "checkasm.h"
+
+#include "libavcodec/vc1dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
+
+typedef struct {
+const char *name;
+size_t offset;
+} test;
+
+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
+do {\
+uint8_t *p##0 = name##0, *p##1 = name##1;   \
+int i = (size); \
+while (i-- > 0) {   \
+int x = 0x80 | (rnd() & 0x7F);  \
+x >>= rnd() % 9;\
+if (rnd() & 1)  \
+x = -x; \
+*p##1++ = *p##0++ = 0x80 + x;   \
+}   \
+} while (0)
+
+static void check_loop_filter(void)
+{
+/* Deblocking filter buffers are big enough to hold a 16x16 block,
+ * plus 16 columns left and 4 rows above to hold filter inputs
+ * (depending on whether v or h neighbouring block edge, oversized
+ * horizontally to maintain 16-byte alignment) plus 16 columns and
+ * 4 rows below to catch write overflows */
+LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
+LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
+
+  

[FFmpeg-cvslog] checkasm: Add vc1dsp inverse transform tests

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:43 2022 +0100| [2698bfdc93d456d304a38b570052e1a238d64c54] | committer: 
Martin Storsjö

checkasm: Add vc1dsp inverse transform tests

This test deliberately doesn't exercise the full range of inputs described in
the committee draft VC-1 standard. It says:

input coefficients in frequency domain, D, satisfy     -2048 <= D < 2047
intermediate coefficients, E, satisfy                  -4096 <= E < 4095
fully inverse-transformed coefficients, R, satisfy      -512 <= R <  511

For one thing, the inequalities look odd. Did they mean them to go the
other way round? That would make more sense because the equations generally
both add and subtract coefficients multiplied by constants, including powers
of 2. Requiring the most-negative values to be valid extends the number of
bits to represent the intermediate values just for the sake of that one case!

For another thing, the extreme values don't look to occur in real streams -
both in my experience and supported by the following comment in the AArch32
decoder:

tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
This is done because sometimes files have input that causes tN + tM to
overflow. To avoid this overflow, we compute tNhalf, then compute
tNhalf + tM (which doesn't overflow), and then we use vhadd to compute
(tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is
one instruction.

My AArch64 decoder goes further than this. It calculates tNhalf and tM
then does an SRA (essentially a fused halve and add) to compute
(tN + tM) >> 1 without ever having to hold (tNhalf + tM) in a 16-bit element
without overflowing. It only encounters difficulties if either tNhalf or
tM overflow in isolation.

I haven't had sight of the final standard, so it's possible that these
issues were dealt with during finalisation, which could explain the lack
of usage of extreme inputs in real streams. Or a preponderance of decoders
that only support 16-bit intermediate values in their inverse transforms
might have caused encoders to steer clear of such cases.

I have effectively followed this approach in the test, and limited the
scale of the coefficients sufficiently that the existing AArch32 decoder
and my new AArch64 decoder both pass.

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2698bfdc93d456d304a38b570052e1a238d64c54
---

 tests/checkasm/vc1dsp.c | 283 
 1 file changed, 283 insertions(+)

diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
index 2fd6c74d6c..7d4457306f 100644
--- a/tests/checkasm/vc1dsp.c
+++ b/tests/checkasm/vc1dsp.c
@@ -30,12 +30,208 @@
 #include "libavutil/mem_internal.h"
 
 #define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
+#define VC1DSP_SIZED_TEST(func, width, height) { #func, 
offsetof(VC1DSPContext, func), width, height },
 
 typedef struct {
 const char *name;
 size_t offset;
+int width;
+int height;
 } test;
 
+typedef struct matrix {
+size_t width;
+size_t height;
+float d[];
+} matrix;
+
+static const matrix T8 = { 8, 8, {
+12,  12,  12,  12,  12,  12,  12,  12,
+16,  15,   9,   4,  -4,  -9, -15, -16,
+16,   6,  -6, -16, -16,  -6,   6,  16,
+15,  -4, -16,  -9,   9,  16,   4, -15,
+12, -12, -12,  12,  12, -12, -12,  12,
+ 9, -16,   4,  15, -15,  -4,  16,  -9,
+ 6, -16,  16,  -6,  -6,  16, -16,   6,
+ 4,  -9,  15, -16,  16, -15,   9,  -4
+} };
+
+static const matrix T4 = { 4, 4, {
+17,  17,  17,  17,
+22,  10, -10, -22,
+17, -17, -17,  17,
+10, -22,  22, -10
+} };
+
+static const matrix T8t = { 8, 8, {
+12,  16,  16,  15,  12,   9,   6,   4,
+12,  15,   6,  -4, -12, -16, -16,  -9,
+12,   9,  -6, -16, -12,   4,  16,  15,
+12,   4, -16,  -9,  12,  15,  -6, -16,
+12,  -4, -16,   9,  12, -15,  -6,  16,
+12,  -9,  -6,  16, -12,  -4,  16, -15,
+12, -15,   6,   4, -12,  16, -16,   9,
+12, -16,  16, -15,  12,  -9,   6,  -4
+} };
+
+static const matrix T4t = { 4, 4, {
+17,  22,  17,  10,
+17,  10, -17, -22,
+17, -10, -17,  22,
+17, -22,  17, -10
+} };
+
+static matrix *new_matrix(size_t width, size_t height)
+{
+matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof 
(float));
+if (out == NULL) {
+fprintf(stderr, "Memory allocation failure\n");
+exit(EXIT_FAILURE);
+}
+out->width = width;
+out->height = height;
+return out;
+}
+
+static matrix *multiply(const matrix *a, const matrix *b)
+{
+matrix *out;
+if (a->width != b->height) {
+fprintf(stderr, "Incompatible multiplication\n");
+exit(EXIT_FAILURE);
+}
+o

[FFmpeg-cvslog] checkasm: Add idctdsp add/put-pixels-clamped tests

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:44 2022 +0100| [bd3615a81a3387cafb51444927e852423f8f4a6e] | committer: 
Martin Storsjö

checkasm: Add idctdsp add/put-pixels-clamped tests

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bd3615a81a3387cafb51444927e852423f8f4a6e
---

 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/idctdsp.c  | 98 +++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 104 insertions(+)

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 7133a6ee66..f6b1008855 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -9,6 +9,7 @@ AVCODECOBJS-$(CONFIG_G722DSP)   += g722dsp.o
 AVCODECOBJS-$(CONFIG_H264DSP)   += h264dsp.o
 AVCODECOBJS-$(CONFIG_H264PRED)  += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)  += h264qpel.o
+AVCODECOBJS-$(CONFIG_IDCTDSP)   += idctdsp.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)  += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)   += llviddspenc.o
 AVCODECOBJS-$(CONFIG_VC1DSP)+= vc1dsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index c2efd81b6d..57134f96ea 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -123,6 +123,9 @@ static const struct {
 #if CONFIG_HUFFYUV_DECODER
 { "huffyuvdsp", checkasm_check_huffyuvdsp },
 #endif
+#if CONFIG_IDCTDSP
+{ "idctdsp", checkasm_check_idctdsp },
+#endif
 #if CONFIG_JPEG2000_DECODER
 { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 52ab18a5b1..a86db140e3 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -64,6 +64,7 @@ void checkasm_check_hevc_idct(void);
 void checkasm_check_hevc_pel(void);
 void checkasm_check_hevc_sao(void);
 void checkasm_check_huffyuvdsp(void);
+void checkasm_check_idctdsp(void);
 void checkasm_check_jpeg2000dsp(void);
 void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c
new file mode 100644
index 00..02724536a7
--- /dev/null
+++ b/tests/checkasm/idctdsp.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022 Ben Avison
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "checkasm.h"
+
+#include "libavcodec/idctdsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
+
+typedef struct {
+const char *name;
+size_t offset;
+} test;
+
+#define RANDOMIZE_BUFFER16(name, size)  \
+do {\
+int i;  \
+for (i = 0; i < size; ++i) {\
+uint16_t r = rnd() % 0x201 - 0x100; \
+AV_WN16A(name##0 + i, r);   \
+AV_WN16A(name##1 + i, r);   \
+}   \
+} while (0)
+
+#define RANDOMIZE_BUFFER8(name, size) \
+do {  \
+int i;\
+for (i = 0; i < size; ++i) {  \
+uint8_t r = rnd();\
+name##0[i] = r;   \
+name##1[i] = r;   \
+} \
+} while (0)
+
+static void check_add_put_clamped(void)
+{
+/* Source buffers are only as big as needed, since any over-read won't 
affect results */
+LOCAL_ALIGNED_16(int16_t, src0, [64]);
+LOCAL_ALIGNED_16(int16_t, src1, [64]);
+/* Destination buffers have borders of one row above/below and 8 columns 
left/right to catch overflows */
+LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
+LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
+
+AVCodecContext avctx = { 0 };
+IDCTDSPContext h;
+
+c

[FFmpeg-cvslog] avcodec/vc1: Introduce fast path for unescaping bitstream buffer

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:45 2022 +0100| [2e268477802d64aa75b9c3c2cb2fc89d1ef7c87d] | committer: 
Martin Storsjö

avcodec/vc1: Introduce fast path for unescaping bitstream buffer

Includes a checkasm test.

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2e268477802d64aa75b9c3c2cb2fc89d1ef7c87d
---

 libavcodec/vc1dec.c | 20 +++
 libavcodec/vc1dsp.c |  2 ++
 libavcodec/vc1dsp.h |  3 +++
 tests/checkasm/vc1dsp.c | 67 +
 4 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index e279ffd1c1..0426e8a752 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -491,7 +491,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 size = next - start - 4;
 if (size <= 0)
 continue;
-buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
 init_get_bits(&gb, buf2, buf2_size * 8);
 switch (AV_RB32(start)) {
 case VC1_CODE_SEQHDR:
@@ -681,7 +681,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 case VC1_CODE_FRAME:
 if (avctx->hwaccel)
 buf_start = start;
-buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, 
buf2);
 break;
 case VC1_CODE_FIELD: {
 int buf_size3;
@@ -698,8 +698,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 ret = AVERROR(ENOMEM);
 goto err;
 }
-buf_size3 = vc1_unescape_buffer(start + 4, size,
-slices[n_slices].buf);
+buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+  
slices[n_slices].buf);
 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
   buf_size3 << 3);
 slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -710,7 +710,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 break;
 }
 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
-buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, 
buf2);
 init_get_bits(&s->gb, buf2, buf_size2 * 8);
 ff_vc1_decode_entry_point(avctx, v, &s->gb);
 break;
@@ -727,8 +727,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 ret = AVERROR(ENOMEM);
 goto err;
 }
-buf_size3 = vc1_unescape_buffer(start + 4, size,
-slices[n_slices].buf);
+buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+  
slices[n_slices].buf);
 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
   buf_size3 << 3);
 slices[n_slices].mby_start = 
get_bits(&slices[n_slices].gb, 9);
@@ -762,7 +762,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 ret = AVERROR(ENOMEM);
 goto err;
 }
-buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - 
divider - 4, slices[n_slices].buf);
+buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + 
buf_size - divider - 4, slices[n_slices].buf);
 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
   buf_size3 << 3);
 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -771,9 +771,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void 
*data,
 n_slices1 = n_slices - 1;
 n_slices++;
 }
-buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, 
buf2);
 } else {
-buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
 }
 init_get_bits(&s->gb, buf2, buf_size2*8);
 } else{
diff --git a/libavcodec

[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON deblocking filter fast paths

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:46 2022 +0100| [c62bbd4d2015ffa717369e687601fb2d481af6b0] | committer: 
Martin Storsjö

avcodec/vc1: Arm 64-bit NEON deblocking filter fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C
version can still outperform the NEON version in specific cases. The balance
between different code paths is stream-dependent, but in practice the best
case happens about 5% of the time, the worst case happens about 40% of the
time, and the complexity of the remaining cases falls somewhere in between.
Therefore, taking the average of the best and worst case timings is
probably a conservative estimate of the degree by which the NEON code
improves performance.

vc1dsp.vc1_h_loop_filter4_bestcase_c: 10.7
vc1dsp.vc1_h_loop_filter4_bestcase_neon: 43.5
vc1dsp.vc1_h_loop_filter4_worstcase_c: 184.5
vc1dsp.vc1_h_loop_filter4_worstcase_neon: 73.7
vc1dsp.vc1_h_loop_filter8_bestcase_c: 31.2
vc1dsp.vc1_h_loop_filter8_bestcase_neon: 62.2
vc1dsp.vc1_h_loop_filter8_worstcase_c: 358.2
vc1dsp.vc1_h_loop_filter8_worstcase_neon: 88.2
vc1dsp.vc1_h_loop_filter16_bestcase_c: 51.0
vc1dsp.vc1_h_loop_filter16_bestcase_neon: 107.7
vc1dsp.vc1_h_loop_filter16_worstcase_c: 722.7
vc1dsp.vc1_h_loop_filter16_worstcase_neon: 140.5
vc1dsp.vc1_v_loop_filter4_bestcase_c: 9.7
vc1dsp.vc1_v_loop_filter4_bestcase_neon: 43.0
vc1dsp.vc1_v_loop_filter4_worstcase_c: 178.7
vc1dsp.vc1_v_loop_filter4_worstcase_neon: 69.0
vc1dsp.vc1_v_loop_filter8_bestcase_c: 30.2
vc1dsp.vc1_v_loop_filter8_bestcase_neon: 50.7
vc1dsp.vc1_v_loop_filter8_worstcase_c: 353.0
vc1dsp.vc1_v_loop_filter8_worstcase_neon: 69.2
vc1dsp.vc1_v_loop_filter16_bestcase_c: 60.0
vc1dsp.vc1_v_loop_filter16_bestcase_neon: 90.0
vc1dsp.vc1_v_loop_filter16_worstcase_c: 714.2
vc1dsp.vc1_v_loop_filter16_worstcase_neon: 97.2

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c62bbd4d2015ffa717369e687601fb2d481af6b0
---

 libavcodec/aarch64/Makefile  |   1 +
 libavcodec/aarch64/vc1dsp_init_aarch64.c |  14 +
 libavcodec/aarch64/vc1dsp_neon.S | 692 +++
 3 files changed, 707 insertions(+)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..5b25e4dfb9 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -48,6 +48,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP) += 
aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)+= aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)+= aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
+NEON-OBJS-$(CONFIG_VC1DSP)  += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)  += aarch64/vp8dsp_neon.o
 
 # decoders/encoders
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c 
b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 13dfd74940..8f96e4802d 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,6 +25,13 @@
 
 #include "config.h"
 
+void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
+void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
+
 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
@@ -39,6 +46,13 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 int cpu_flags = av_get_cpu_flags();
 
 if (have_neon(cpu_flags)) {
+dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
+dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
+dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
+dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
+dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
+dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+
 dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
 dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
new file mode 100644
index 0000000000..1ea9fa75ff
--- /dev/null
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -0,0 +1,692 @@
+/*
+ * VC1 AArch64 NEON optimisations
+ *
+ * Copyright (c) 2022 Ben Avison 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it u

[FFmpeg-cvslog] avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:47 2022 +0100| [c07de58a725a508c628ddea7d936771c42c189aa] | committer: 
Martin Storsjö

avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C
version can still outperform the NEON version in specific cases. The balance
between different code paths is stream-dependent, but in practice the best
case happens about 5% of the time, the worst case happens about 40% of the
time, and the complexity of the remaining cases falls somewhere in between.
Therefore, taking the average of the best and worst case timings is
probably a conservative estimate of the degree by which the NEON code
improves performance.

vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0
vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5
vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7
vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2
vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0
vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0
vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0
vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7
vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7
vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0
vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7
vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7
vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2
vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2
vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2
vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5
vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5
vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2
vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2
vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7
vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2
vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7
vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5
vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c07de58a725a508c628ddea7d936771c42c189aa
---

 libavcodec/arm/vc1dsp_init_neon.c |  14 +
 libavcodec/arm/vc1dsp_neon.S  | 643 ++
 2 files changed, 657 insertions(+)

diff --git a/libavcodec/arm/vc1dsp_init_neon.c 
b/libavcodec/arm/vc1dsp_init_neon.c
index 2cca784f5a..f5f5c702d7 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -32,6 +32,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
 void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
+
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
 
@@ -92,6 +99,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
 dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
 dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
+dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
+dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
+dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
+dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
+dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
+dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+
 dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
 FN_ASSIGN(1, 0);
 FN_ASSIGN(2, 0);
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 93f043bf08..ba54221ef6 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
 vst1.32 {d1[1]},  [r0,:32]
 bx  lr
 endfunc
+
+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
+@ On entry:
+@   r0 -> top-left pel of lower block
+@   r1 = row stride, bytes
+@   r2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter4_neon, export=1
+sub r3, r0, r1, lsl #2
+vldrd0, .Lcoeffs
+vld1.32 {d1[0]}, [r0], r1   @ P5
+vld1.32 {d2[0]}, [r3], r1   @ P1
+vld1.32 {d3[0]}, [r3], r1   @ P2
+vld1.32 {d4[0]}, [r0], r1   @ P6
+vld1.32 {d5[0]}, [r3], r1   @ P3
+vld1.32 {d6[0]}, [r0], r1   @ P7
+vld1.32 {d7[0]}, [r3]   @ P4
+vld1.32 {d16[0]}, [r0] 

[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON inverse transform fast paths

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:48 2022 +0100| [501fdc017deb1b57ecc17420ba41686a14932fcc] | committer: 
Martin Storsjö

avcodec/vc1: Arm 64-bit NEON inverse transform fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_inv_trans_4x4_c: 158.2
vc1dsp.vc1_inv_trans_4x4_neon: 65.7
vc1dsp.vc1_inv_trans_4x4_dc_c: 86.5
vc1dsp.vc1_inv_trans_4x4_dc_neon: 26.5
vc1dsp.vc1_inv_trans_4x8_c: 335.2
vc1dsp.vc1_inv_trans_4x8_neon: 106.2
vc1dsp.vc1_inv_trans_4x8_dc_c: 151.2
vc1dsp.vc1_inv_trans_4x8_dc_neon: 25.5
vc1dsp.vc1_inv_trans_8x4_c: 365.7
vc1dsp.vc1_inv_trans_8x4_neon: 97.2
vc1dsp.vc1_inv_trans_8x4_dc_c: 139.7
vc1dsp.vc1_inv_trans_8x4_dc_neon: 16.5
vc1dsp.vc1_inv_trans_8x8_c: 547.7
vc1dsp.vc1_inv_trans_8x8_neon: 137.0
vc1dsp.vc1_inv_trans_8x8_dc_c: 268.2
vc1dsp.vc1_inv_trans_8x8_dc_neon: 30.5

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=501fdc017deb1b57ecc17420ba41686a14932fcc
---

 libavcodec/aarch64/vc1dsp_init_aarch64.c |  19 +
 libavcodec/aarch64/vc1dsp_neon.S | 678 +++
 2 files changed, 697 insertions(+)

diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c 
b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 8f96e4802d..e0eb52dd63 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,6 +25,16 @@
 
 #include "config.h"
 
+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
 void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
 void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
 void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
@@ -46,6 +56,15 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 int cpu_flags = av_get_cpu_flags();
 
 if (have_neon(cpu_flags)) {
+dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
+dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
+dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
+dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
+dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
+dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
+dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
 dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
 dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
 dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index 1ea9fa75ff..0201db4f78 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -22,7 +22,685 @@
 
 #include "libavutil/aarch64/asm.S"
 
+// VC-1 8x8 inverse transform
+// On entry:
+//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
+// On exit:
+//   array at x0 updated to hold transformed block; also now held in row-major order
+function ff_vc1_inv_trans_8x8_neon, export=1
+ld1 {v1.16b, v2.16b}, [x0], #32
+ld1 {v3.16b, v4.16b}, [x0], #32
+ld1 {v5.16b, v6.16b}, [x0], #32
+shl v1.8h, v1.8h, #2// 8/2 * src[0]
+sub x1, x0, #3*32
+ld1 {v16.16b, v17.16b}, [x0]
+shl v7.8h, v2.8h, #4//  16 * src[8]
+shl v18.8h, v2.8h, #2   //   4 * src[8]
+shl v19.8h, v4.8h, #4   //16 * src[24]
+ldr d0, .Lcoeffs_it8
+shl v5.8h, v5.8h, #2    //  8/2 * src[32]
+shl v20.8h, v6.8h, #4   //   16 * src[40]
+shl v21.8h, v6.8h, #2   //    4 * src[40]
+shl v22.8h, v17.8h, #4  //  16 * src[56]
+ssra v20.8h, v19.8h, #2  // 4 * src[24] + 16 * src[40]
+mul v23.8h, v3.8h, v0.h[0]  //   6/2

[FFmpeg-cvslog] avcodec/vc1: Arm 32-bit NEON unescape fast path

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:51 2022 +0100| [23c92e14f5fdb0c2928b44bb94d4c0711439e1c7] | committer: 
Martin Storsjö

avcodec/vc1: Arm 32-bit NEON unescape fast path

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_unescape_buffer_c: 918624.7
vc1dsp.vc1_unescape_buffer_neon: 142958.0

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=23c92e14f5fdb0c2928b44bb94d4c0711439e1c7
---

 libavcodec/arm/vc1dsp_init_neon.c |  61 
 libavcodec/arm/vc1dsp_neon.S  | 118 ++
 2 files changed, 179 insertions(+)

diff --git a/libavcodec/arm/vc1dsp_init_neon.c 
b/libavcodec/arm/vc1dsp_init_neon.c
index f5f5c702d7..48cb816b70 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -19,6 +19,7 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/intreadwrite.h"
 #include "libavcodec/vc1dsp.h"
 #include "vc1dsp.h"
 
@@ -84,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+/* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+int dsize = 0;
+while (size >= 4)
+{
+int found = 0;
+while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+{
+found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+if (!found)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+}
+if (!found)
+{
+int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+dst += skip;
+src += skip;
+size -= skip;
+dsize += skip;
+while (!found && size >= 4)
+{
+found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+if (!found)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+}
+}
+if (found)
+{
+*dst++ = *src++;
+*dst++ = *src++;
+++src;
+size -= 3;
+dsize += 2;
+}
+}
+while (size > 0)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+return dsize;
+}
+
 #define FN_ASSIGN(X, Y) \
 dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
 dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -130,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
 dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
 }
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index ba54221ef6..96014fbebc 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1
 4:  vpop{d8-d15}
 pop {r4-r6,pc}
 endfunc
+
+@ Copy at most the specified number of bytes from source to destination buffer,
+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
+@ On entry:
+@   r0 -> source buffer
+@   r1 = max number of bytes to copy
+@   r2 -> destination buffer, optimally 8-byte aligned
+@ On exit:
+@   r0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+@ Offset by 48 to screen out cases that are too short for us to handle,
+@ and also make it easy to test for loop termination, or to determine
+@ whether we need an odd number of half-iterations of the loop.
+subsr1, r1, #48
+bmi 90f
+
+@ Set up useful constants
+vmov.i32 q0, #0x3000000
+vmov.i32 q1, #0x30000
+
+tst r1, #16
+bne 1f
+
+  vld1.8  {q8, q9}, [r0]!
+  vbicq12, q8, q0
+  vext.8  q13, q8, q9, #1
+  vext.8  q14, q8, q9, #2
+  

[FFmpeg-cvslog] avcodec/idctdsp: Arm 64-bit NEON block add and clamp fast paths

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:49 2022 +0100| [5379412ed0c587d82788c6fc46b7787cfe10f72d] | committer: 
Martin Storsjö

avcodec/idctdsp: Arm 64-bit NEON block add and clamp fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

idctdsp.add_pixels_clamped_c: 313.3
idctdsp.add_pixels_clamped_neon: 24.3
idctdsp.put_pixels_clamped_c: 220.3
idctdsp.put_pixels_clamped_neon: 15.5
idctdsp.put_signed_pixels_clamped_c: 210.5
idctdsp.put_signed_pixels_clamped_neon: 19.5

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5379412ed0c587d82788c6fc46b7787cfe10f72d
---

 libavcodec/aarch64/Makefile   |   3 +-
 libavcodec/aarch64/idctdsp_init_aarch64.c |  26 --
 libavcodec/aarch64/idctdsp_neon.S | 130 ++
 3 files changed, 150 insertions(+), 9 deletions(-)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 5b25e4dfb9..c8935f205e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -44,7 +44,8 @@ NEON-OBJS-$(CONFIG_H264PRED)+= 
aarch64/h264pred_neon.o
 NEON-OBJS-$(CONFIG_H264QPEL)+= aarch64/h264qpel_neon.o 
\
aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o  
\
+   aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)+= aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)+= aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c 
b/libavcodec/aarch64/idctdsp_init_aarch64.c
index 742a3372e3..eec21aa5a2 100644
--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -27,19 +27,29 @@
 #include "libavcodec/idctdsp.h"
 #include "idct.h"
 
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+
 av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
  unsigned high_bit_depth)
 {
 int cpu_flags = av_get_cpu_flags();
 
-if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
-if (avctx->idct_algo == FF_IDCT_AUTO ||
-avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
-avctx->idct_algo == FF_IDCT_SIMPLENEON) {
-c->idct_put  = ff_simple_idct_put_neon;
-c->idct_add  = ff_simple_idct_add_neon;
-c->idct  = ff_simple_idct_neon;
-c->perm_type = FF_IDCT_PERM_PARTTRANS;
+if (have_neon(cpu_flags)) {
+if (!avctx->lowres && !high_bit_depth) {
+if (avctx->idct_algo == FF_IDCT_AUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+c->idct_put  = ff_simple_idct_put_neon;
+c->idct_add  = ff_simple_idct_add_neon;
+c->idct  = ff_simple_idct_neon;
+c->perm_type = FF_IDCT_PERM_PARTTRANS;
+}
 }
+
+c->add_pixels_clamped= ff_add_pixels_clamped_neon;
+c->put_pixels_clamped= ff_put_pixels_clamped_neon;
+c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
 }
 }
diff --git a/libavcodec/aarch64/idctdsp_neon.S 
b/libavcodec/aarch64/idctdsp_neon.S
new file mode 100644
index 0000000000..7f47611206
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_neon.S
@@ -0,0 +1,130 @@
+/*
+ * IDCT AArch64 NEON optimisations
+ *
+ * Copyright (c) 2022 Ben Avison 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Clamp 16-bit signed block coefficients to unsigned 8-bit
+// 

[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON unescape fast path

2022-04-01 Thread Ben Avison
ffmpeg | branch: master | Ben Avison  | Thu Mar 31 
18:23:50 2022 +0100| [6eee65028957c3b16287a204e648caebcc86b06c] | committer: 
Martin Storsjö

avcodec/vc1: Arm 64-bit NEON unescape fast path

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_unescape_buffer_c: 655617.7
vc1dsp.vc1_unescape_buffer_neon: 118237.0

Signed-off-by: Ben Avison 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6eee65028957c3b16287a204e648caebcc86b06c
---

 libavcodec/aarch64/vc1dsp_init_aarch64.c |  61 +++
 libavcodec/aarch64/vc1dsp_neon.S | 176 +++
 2 files changed, 237 insertions(+)

diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c 
b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index e0eb52dd63..a7976fd596 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -21,6 +21,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/aarch64/cpu.h"
+#include "libavutil/intreadwrite.h"
 #include "libavcodec/vc1dsp.h"
 
 #include "config.h"
@@ -51,6 +52,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 int h, int x, int y);
 
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+/* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+int dsize = 0;
+while (size >= 4)
+{
+int found = 0;
+while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+{
+found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+if (!found)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+}
+if (!found)
+{
+int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+dst += skip;
+src += skip;
+size -= skip;
+dsize += skip;
+while (!found && size >= 4)
+{
+found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
+if (!found)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+}
+}
+if (found)
+{
+*dst++ = *src++;
+*dst++ = *src++;
+++src;
+size -= 3;
+dsize += 2;
+}
+}
+while (size > 0)
+{
+*dst++ = *src++;
+--size;
+++dsize;
+}
+return dsize;
+}
+
 av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 {
 int cpu_flags = av_get_cpu_flags();
@@ -76,5 +135,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
 dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
 dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
 }
 }
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index 0201db4f78..9a96c2523c 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -1368,3 +1368,179 @@ function ff_vc1_h_loop_filter16_neon, export=1
 st2 {v2.b, v3.b}[7], [x6]
 4:  ret
 endfunc
+
+// Copy at most the specified number of bytes from source to destination buffer,
+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
+// On entry:
+//   x0 -> source buffer
+//   w1 = max number of bytes to copy
+//   x2 -> destination buffer, optimally 8-byte aligned
+// On exit:
+//   w0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+// Offset by 80 to screen out cases that are too short for us to handle,
+// and also make it easy to test for loop termination, or to determine
+// whether we need an odd number of half-iterations of the loop.
+subsw1, w1, #80
+b.mi90f
+
+// Set up useful constants
+moviv20.4s, #3, lsl #24
+moviv21.4s, #3, lsl #16
+
+tst w1, #32
+b.ne1f
+
+  ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
+  ext v25.16b, v0.16b, v1.16b