[FFmpeg-cvslog] armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:14:28 2014 +0100| [42c1cc35b7623ce76c7b55c6bc100f135e17cd4f] | committer: Michael Niedermayer armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6) The previous implementation targeted DTS Coherent Acoustics, which only requires mdct_bits == 6. This relatively small size lent itself to unrolling the loops a small number of times, and encoding offsets calculated at assembly time within the load/store instructions of each iteration. In the more general case (codecs such as AAC and AC3) much larger arrays are used - mdct_bits == [8, 9, 11]. The old method does not scale for these cases, so more integer registers are used with non-unrolled versions of the loops (and with some stack spillage). The postrotation filter loop is still unrolled by a factor of 2 to permit the double-buffering of some VFP registers to facilitate overlap of neighbouring iterations. I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same example AAC stream: Before After Mean StdDev Mean StdDev Confidence Change aac_decode_frame 2368.1 35.8 2117.2 35.3 100.0% +11.8% ff_imdct_half_* 457.5 22.4 251.2 16.2 100.0% +82.1% Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=42c1cc35b7623ce76c7b55c6bc100f135e17cd4f --- libavcodec/arm/mdct_vfp.S | 146 - 1 file changed, 144 insertions(+), 2 deletions(-) diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S index ee3984c..43f6d14 100644 --- a/libavcodec/arm/mdct_vfp.S +++ b/libavcodec/arm/mdct_vfp.S @@ -33,6 +33,11 @@ J0 .reqa2 J1 .reqa4 J2 .reqip J3 .reqlr +REVTAB_HI .req v5 +IN_HI .reqv6 +OUT_HI .reqv6 +TCOS_HI .reqsl +TSIN_HI .reqfp .macro prerotation_innerloop .set trig_lo, k @@ -76,6 +81,43 @@ J3 .reqlr .set k, k + 2 .endm +.macro prerotation_innerloop_rolled +vldmia TCOS!, 
{s16,s17} +vldmdb TCOS_HI!, {s18,s19} +vldrs0, [IN_HI, #-4] +vldrs1, [IN_HI, #-12] +vldrs2, [IN, #12] +vldrs3, [IN, #4] +vmul.f s8, s0, s16 @ vector operation +vldmia TSIN!, {s20,s21} +vldmdb TSIN_HI!, {s22,s23} +vldrs4, [IN] +vldrs5, [IN, #8] +vldrs6, [IN_HI, #-16] +vldrs7, [IN_HI, #-8] +vmul.f s12, s0, s20@ vector operation +add IN, IN, #16 +sub IN_HI, IN_HI, #16 +ldrhJ0, [REVTAB], #2 +ldrhJ1, [REVTAB], #2 +vmls.f s8, s4, s20 @ vector operation +ldrhJ3, [REVTAB_HI, #-2]! +ldrhJ2, [REVTAB_HI, #-2]! +add J0, OUT, J0, lsl #3 +vmla.f s12, s4, s16@ vector operation +add J1, OUT, J1, lsl #3 +add J2, OUT, J2, lsl #3 +add J3, OUT, J3, lsl #3 +vstrs8, [J0] +vstrs9, [J1] +vstrs10, [J2] +vstrs11, [J3] +vstrs12, [J0, #4] +vstrs13, [J1, #4] +vstrs14, [J2, #4] +vstrs15, [J3, #4] +.endm + .macro postrotation_innerloop tail, head .set trig_lo_head, n8 - k - 2 .set trig_hi_head, n8 + k @@ -142,6 +184,49 @@ J3 .reqlr .endif .endm +.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail + .ifnc "\tail","" +vmls.f s8, s0, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" +vldmia TSIN!, {s16,s17} +vldmdb TSIN_HI!, {s18,s19} +vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} + .endif + .ifnc "\tail","" +vmla.f s12, s4, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" +vldrs0, [OUT, #+\out_offset_head+0] +vldrs1, [OUT, #+\out_offset_head+8] +vldrs2, [OUT_HI, #-\out_offset_head-16] +vldrs3, [OUT_HI, #-\out_offset_head-8] +vldrs4, [OUT, #+\out_offset_head+4] +vldrs5, [OUT, #+\out_offset_head+12] +vldrs6, [OUT_HI, #-\out_offset_head-12] +vldrs7, [OUT_HI, #-\out_offset_head-4] + .endif + .ifnc "\tail","" +vstrs8, [OUT, #+\out_offset_tail+0] +vstrs9, [OUT, #+\out_offset_tail+8] +vstrs10, [OUT_HI, #-\out_offset_tail-16] +vstrs11, [OUT_HI, #-\out_offset_tail-8] + .endif + .ifnc "\head","" +vmul.f s8, s4, s16 @ vector operation + .endi
[FFmpeg-cvslog] armv6: Accelerate butterflies_float
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:14:31 2014 +0100| [57641410d1a386937bec3fddd6c75119550916ec] | committer: Michael Niedermayer armv6: Accelerate butterflies_float I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 1542.8 43.7 1470.5 41.5 100.0% +4.9% butterflies_float 130.0 11.9 70.2 12.1 100.0% +85.2% Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=57641410d1a386937bec3fddd6c75119550916ec --- libavutil/arm/float_dsp_init_vfp.c |4 ++ libavutil/arm/float_dsp_vfp.S | 116 2 files changed, 120 insertions(+) diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index 4dfe012..45508b8 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0, void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len); + av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) { if (!have_vfpv3(cpu_flags)) { @@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) fdsp->vector_fmul_window = ff_vector_fmul_window_vfp; } fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; +if (!have_vfpv3(cpu_flags)) +fdsp->butterflies_float = ff_butterflies_float_vfp; } diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index 13ff219..7db2452 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop{d8-d15} bx lr endfunc + +/** + * ARM VFP implementation of 
'butterflies_float_c' function + * Assume that len is a positive non-zero number + */ +@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len) +function ff_butterflies_float_vfp, export=1 +BASE1 .reqa1 +BASE2 .reqa2 +LEN .reqa3 +OLDFPSCR .req a4 + +vpush {s16-s31} +fmrxOLDFPSCR, FPSCR + +tst LEN, #7 +beq 4f @ common case: len is a multiple of 8 + +ldr ip, =0x0300 @ RunFast mode, scalar mode +fmxrFPSCR, ip + +tst LEN, #1 +beq 1f +vldmia BASE1!, {s0} +vldmia BASE2!, {s8} +vadd.f s16, s0, s8 +vsub.f s24, s0, s8 +vstrs16, [BASE1, #0-4*1] +vstrs24, [BASE2, #0-4*1] +1: +tst LEN, #2 +beq 2f +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vadd.f s16, s0, s8 +vadd.f s17, s1, s9 +vsub.f s24, s0, s8 +vsub.f s25, s1, s9 +vstrd8, [BASE1, #0-8*1]@ s16,s17 +vstrd12, [BASE2, #0-8*1] @ s24,s25 +2: +tst LEN, #4 +beq 3f +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vldmia BASE1!, {s2-s3} +vldmia BASE2!, {s10-s11} +vadd.f s16, s0, s8 +vadd.f s17, s1, s9 +vsub.f s24, s0, s8 +vsub.f s25, s1, s9 +vadd.f s18, s2, s10 +vadd.f s19, s3, s11 +vsub.f s26, s2, s10 +vsub.f s27, s3, s11 +vstrd8, [BASE1, #0-16*1]@ s16,s17 +vstrd12, [BASE2, #0-16*1] @ s24,s25 +vstrd9, [BASE1, #8-16*1]@ s18,s19 +vstrd13, [BASE2, #8-16*1] @ s26,s27 +3: +bicsLEN, LEN, #7 +beq 7f +4: +ldr ip, =0x0303 @ RunFast mode, short vectors of length 4, stride 1 +fmxrFPSCR, ip + +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vldmia BASE1!, {s2-s3} +vldmia BASE2!, {s10-s11} +vadd.f s16, s0, s8 +vldmia BASE1!, {s4-s5} +vldmia BASE2!, {s12-s13} +vldmia BASE1!, {s6-s7} +vldmia BASE2!, {s14-s15} +vsub.f s24, s0, s8 +vadd.f s20, s4, s12 +subsLEN, LEN, #8 +beq 6f +5: vldmia BASE1!, {s0-s3} +vldmia BASE2!, {s8-s11} +vsub.f s28, s4, s12 +vstrd8, [BASE1, #0-16*3]@ s16,s17 +vstrd9, [BASE1, #8-16*3]@ s18,s19 +vstrd12, [BASE2, #0-16*3] @ s24,s25 +vstrd13, [BASE2, #8-16*3] @ s26,s27 +
[FFmpeg-cvslog] armv6: Accelerate vector_fmul_window
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:14:30 2014 +0100| [649c666137f43542b45941f42034ab3f44a31d38] | committer: Michael Niedermayer armv6: Accelerate vector_fmul_window I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in vector_fmul_window_c() / ff_vector_fmul_window_vfp() for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 1598.2 47.4 1529.2 25.4 100.0% +4.5% vector_fmul_window 244.0 22.1 188.9 22.3 100.0% +29.2% Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=649c666137f43542b45941f42034ab3f44a31d38 --- libavutil/arm/float_dsp_init_vfp.c |7 +- libavutil/arm/float_dsp_vfp.S | 204 2 files changed, 210 insertions(+), 1 deletion(-) diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index 1fe52ab..4dfe012 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -26,12 +26,17 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_window_vfp(float *dst, const float *src0, + const float *src1, const float *win, int len); + void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) { -if (!have_vfpv3(cpu_flags)) +if (!have_vfpv3(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_vfp; +fdsp->vector_fmul_window = ff_vector_fmul_window_vfp; +} fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index 8695fbd..13ff219 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -68,6 +68,210 @@ function ff_vector_fmul_vfp, export=1 endfunc /** + * ARM VFP implementation of 'vector_fmul_window_c' function + * Assume that len is a positive 
non-zero number + */ +@ void ff_vector_fmul_window_vfp(float *dst, const float *src0, +@const float *src1, const float *win, int len) +function ff_vector_fmul_window_vfp, export=1 +DST0.reqa1 +SRC0.reqa2 +SRC1.reqa3 +WIN0.reqa4 +LEN .reqv1 +DST1.reqv2 +WIN1.reqv3 +OLDFPSCR .req ip + +push{v1-v3,lr} +ldr LEN, [sp, #4*4+0] +vpush {s16-s31} +fmrxOLDFPSCR, FPSCR +add DST1, DST0, LEN, lsl #3 +add SRC1, SRC1, LEN, lsl #2 +add WIN1, WIN0, LEN, lsl #3 + +tst LEN, #7 +beq 4f @ common case: len is a multiple of 8 + +ldr lr, =0x0300 @ RunFast mode, scalar mode +fmxrFPSCR, lr + +tst LEN, #1 +beq 1f +vldmdb WIN1!, {s0} +vldmia SRC0!, {s8} +vldmia WIN0!, {s16} +vmul.f s24, s0, s8 +vldmdb SRC1!, {s20} +vmul.f s8, s16, s8 +vmls.f s24, s16, s20 +vmla.f s8, s0, s20 +vstmia DST0!, {s24} +vstmdb DST1!, {s8} +1: +tst LEN, #2 +beq 2f +vldmdb WIN1!, {s0} +vldmdb WIN1!, {s1} +vldmia SRC0!, {s8-s9} +vldmia WIN0!, {s16-s17} +vmul.f s24, s0, s8 +vmul.f s25, s1, s9 +vldmdb SRC1!, {s20} +vldmdb SRC1!, {s21} +vmul.f s8, s16, s8 +vmul.f s9, s17, s9 +vmls.f s24, s16, s20 +vmls.f s25, s17, s21 +vmla.f s8, s0, s20 +vmla.f s9, s1, s21 +vstmia DST0!, {s24-s25} +vstmdb DST1!, {s8} +vstmdb DST1!, {s9} +2: +tst LEN, #4 +beq 3f +vldmdb WIN1!, {s0} +vldmdb WIN1!, {s1} +vldmdb WIN1!, {s2} +vldmdb WIN1!, {s3} +vldmia SRC0!, {s8-s11} +vldmia WIN0!, {s16-s19} +vmul.f s24, s0, s8 +vmul.f s25, s1, s9 +vmul.f s26, s2, s10 +vmul.f s27, s3, s11 +vldmdb SRC1!, {s20} +vldmdb SRC1!, {s21} +vldmdb SRC1!, {s22} +vldmdb SRC1!, {s23} +vmul.f s8, s16, s8 +vmul.f s9, s17, s9 +vmul.f s10, s18, s10 +vmul.f s11, s19, s11 +vmls.f s24, s16, s20 +vmls.f s25, s17, s21 +vmls.f s26, s18, s22 +vmls.f s27, s19, s23 +vmla.f s8, s0, s20 +vmla.f s9, s1, s21 +vmla.f s10, s2, s22 +vmla.f s11, s3, s23 +vstmia DST0!, {s24-s27} +vstmdb DST1!, {s8} +vstmdb DST1!, {s9} +vstmdb DST1!,
[FFmpeg-cvslog] armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6)
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:12:31 2014 +0100| [5c22e8e4ad0852d61d5c4ba8d67d33fd72339497] | committer: Martin Storsjö armv6: Accelerate ff_imdct_half for general case (mdct_bits != 6) The previous implementation targeted DTS Coherent Acoustics, which only requires mdct_bits == 6. This relatively small size lent itself to unrolling the loops a small number of times, and encoding offsets calculated at assembly time within the load/store instructions of each iteration. In the more general case (codecs such as AAC and AC3) much larger arrays are used - mdct_bits == [8, 9, 11]. The old method does not scale for these cases, so more integer registers are used with non-unrolled versions of the loops (and with some stack spillage). The postrotation filter loop is still unrolled by a factor of 2 to permit the double-buffering of some VFP registers to facilitate overlap of neighbouring iterations. I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same example AAC stream: Before After Mean StdDev Mean StdDev Confidence Change aac_decode_frame 2368.1 35.8 2117.2 35.3 100.0% +11.8% ff_imdct_half_* 457.5 22.4 251.2 16.2 100.0% +82.1% Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5c22e8e4ad0852d61d5c4ba8d67d33fd72339497 --- libavcodec/arm/mdct_vfp.S | 146 - 1 file changed, 144 insertions(+), 2 deletions(-) diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S index 94db24f..f3fe668 100644 --- a/libavcodec/arm/mdct_vfp.S +++ b/libavcodec/arm/mdct_vfp.S @@ -33,6 +33,11 @@ J0 .reqa2 J1 .reqa4 J2 .reqip J3 .reqlr +REVTAB_HI .req v5 +IN_HI .reqv6 +OUT_HI .reqv6 +TCOS_HI .reqsl +TSIN_HI .reqfp .macro prerotation_innerloop .set trig_lo, k @@ -76,6 +81,43 @@ J3 .reqlr .set k, k + 2 .endm +.macro prerotation_innerloop_rolled +vldmia TCOS!, {s16,s17} +vldmdb 
TCOS_HI!, {s18,s19} +vldrs0, [IN_HI, #-4] +vldrs1, [IN_HI, #-12] +vldrs2, [IN, #12] +vldrs3, [IN, #4] +vmul.f s8, s0, s16 @ vector operation +vldmia TSIN!, {s20,s21} +vldmdb TSIN_HI!, {s22,s23} +vldrs4, [IN] +vldrs5, [IN, #8] +vldrs6, [IN_HI, #-16] +vldrs7, [IN_HI, #-8] +vmul.f s12, s0, s20@ vector operation +add IN, IN, #16 +sub IN_HI, IN_HI, #16 +ldrhJ0, [REVTAB], #2 +ldrhJ1, [REVTAB], #2 +vmls.f s8, s4, s20 @ vector operation +ldrhJ3, [REVTAB_HI, #-2]! +ldrhJ2, [REVTAB_HI, #-2]! +add J0, OUT, J0, lsl #3 +vmla.f s12, s4, s16@ vector operation +add J1, OUT, J1, lsl #3 +add J2, OUT, J2, lsl #3 +add J3, OUT, J3, lsl #3 +vstrs8, [J0] +vstrs9, [J1] +vstrs10, [J2] +vstrs11, [J3] +vstrs12, [J0, #4] +vstrs13, [J1, #4] +vstrs14, [J2, #4] +vstrs15, [J3, #4] +.endm + .macro postrotation_innerloop tail, head .set trig_lo_head, n8 - k - 2 .set trig_hi_head, n8 + k @@ -142,6 +184,49 @@ J3 .reqlr .endif .endm +.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail + .ifnc "\tail","" +vmls.f s8, s0, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" +vldmia TSIN!, {s16,s17} +vldmdb TSIN_HI!, {s18,s19} +vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} + .endif + .ifnc "\tail","" +vmla.f s12, s4, \tcos_s0_tail @ vector operation + .endif + .ifnc "\head","" +vldrs0, [OUT, #+\out_offset_head+0] +vldrs1, [OUT, #+\out_offset_head+8] +vldrs2, [OUT_HI, #-\out_offset_head-16] +vldrs3, [OUT_HI, #-\out_offset_head-8] +vldrs4, [OUT, #+\out_offset_head+4] +vldrs5, [OUT, #+\out_offset_head+12] +vldrs6, [OUT_HI, #-\out_offset_head-12] +vldrs7, [OUT_HI, #-\out_offset_head-4] + .endif + .ifnc "\tail","" +vstrs8, [OUT, #+\out_offset_tail+0] +vstrs9, [OUT, #+\out_offset_tail+8] +vstrs10, [OUT_HI, #-\out_offset_tail-16] +vstrs11, [OUT_HI, #-\out_offset_tail-8] + .endif + .ifnc "\head","" +vmul.f s8, s4, s16 @ vector operation + .endif +
[FFmpeg-cvslog] armv6: Accelerate ff_fft_calc for general case (nbits != 4)
ffmpeg | branch: master | Ben Avison | Wed Jul 16 16:02:01 2014 +0100| [87552d54d3337c3241e8a9e1a05df16eaa821496] | committer: Martin Storsjö armv6: Accelerate ff_fft_calc for general case (nbits != 4) The previous implementation targeted DTS Coherent Acoustics, which only requires nbits == 4 (fft16()). This case was (and still is) linked directly rather than being indirected through ff_fft_calc_vfp(), but now the full range from radix-4 up to radix-65536 is available. This benefits other codecs such as AAC and AC3. The implementation is based upon the C version, with each routine larger than radix-16 calling a hierarchy of smaller FFT functions, then performing a post-processing pass. This pass benefits a lot from loop unrolling to counter the long pipelines in the VFP. A relaxed calling standard also reduces the overhead of the call hierarchy, and avoiding the excessive inlining performed by GCC probably helps with I-cache utilisation too. I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in the FFT routines (fft4() to fft512() and pass()) for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 2245.5 53.1 1599.6 43.8 100.0% +40.4% FFT routines 940.6 22.0 348.1 20.8 100.0% +170.2% Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=87552d54d3337c3241e8a9e1a05df16eaa821496 --- libavcodec/arm/fft_init_arm.c |8 +- libavcodec/arm/fft_vfp.S | 264 ++--- 2 files changed, 255 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 3a3d1a7..bc143c1 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -23,6 +23,8 @@ #include "libavcodec/rdft.h" #include "libavcodec/synth_filter.h" +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); + void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void 
ff_fft_calc_neon(FFTContext *s, FFTComplex *z); @@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); -if (have_vfp(cpu_flags)) { +if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { +s->fft_calc = ff_fft_calc_vfp; #if CONFIG_MDCT -if (!have_vfpv3(cpu_flags)) -s->imdct_half = ff_imdct_half_vfp; +s->imdct_half = ff_imdct_half_vfp; #endif } diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S index 7845ebb..130d529 100644 --- a/libavcodec/arm/fft_vfp.S +++ b/libavcodec/arm/fft_vfp.S @@ -21,8 +21,39 @@ #include "libavutil/arm/asm.S" -@ TODO: * FFTs wider than 16 -@ * dispatch code +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: +@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and +@ all single-precision VFP registers may be corrupted on exit. The a2 +@ register may not be clobbered in these functions, as it holds the +@ stored original FPSCR. + +function ff_fft_calc_vfp, export=1 +ldr ip, [a1, #0]@ nbits +mov a1, a2 +A ldr pc, [pc, ip, lsl #2] +A .word 0 +A .word 0 +A .word 0 +T movrel a2, (fft_tab_vfp - 8) +T ldr pc, [a2, ip, lsl #2] +T endfunc +T const fft_tab_vfp +.word fft4_vfp +.word fft8_vfp +.word X(ff_fft16_vfp) @ this one alone is exported +.word fft32_vfp +.word fft64_vfp +.word fft128_vfp +.word fft256_vfp +.word fft512_vfp +.word fft1024_vfp +.word fft2048_vfp +.word fft4096_vfp +.word fft8192_vfp +.word fft16384_vfp +.word fft32768_vfp +.word fft65536_vfp +A endfunc function fft4_vfp vldrd0, [a1, #0*2*4] @ s0,s1 = z[0] @@ -131,18 +162,22 @@ endfunc vstrd9, [a1, #3 * 2*4] .endm +function .Lfft8_internal_vfp +macro_fft8_head +macro_fft8_tail +bx lr +endfunc + function fft8_vfp ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1 fmrxa2, FPSCR fmxrFPSCR, a3 vpush {s16-s31} - -macro_fft8_head -macro_fft8_tail - +mov ip, lr +bl .Lfft8_internal_vfp vpop{s16-s31} fmxrFPSCR, a2 -bx lr +bx ip endfunc .align 3 @@ -153,12 +188,7 @@ cos1pi8:@ cos(1*pi/8) = 
sqrt(2+sqrt(2))/2 cos3pi8:@ cos(2*pi/8) = sqrt(2-sqrt(2))/2 .float 0.3826834261417388916015625 -function ff_fft16_vfp, export=1 -ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1 -fmrxa2, FPSCR -fmxr
[FFmpeg-cvslog] armv6: Accelerate vector_fmul_window
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:12:33 2014 +0100| [5edad2c4a1f46bcc56be755af86ab355c2f1b37f] | committer: Martin Storsjö armv6: Accelerate vector_fmul_window I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in vector_fmul_window_c() / ff_vector_fmul_window_vfp() for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 1598.2 47.4 1529.2 25.4 100.0% +4.5% vector_fmul_window 244.0 22.1 188.9 22.3 100.0% +29.2% Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5edad2c4a1f46bcc56be755af86ab355c2f1b37f --- libavutil/arm/float_dsp_init_vfp.c |7 +- libavutil/arm/float_dsp_vfp.S | 204 2 files changed, 210 insertions(+), 1 deletion(-) diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index 31cb6ae..f44020e 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -26,12 +26,17 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_window_vfp(float *dst, const float *src0, + const float *src1, const float *win, int len); + void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) { -if (!have_vfpv3(cpu_flags)) +if (!have_vfpv3(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_vfp; +fdsp->vector_fmul_window = ff_vector_fmul_window_vfp; +} fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index 8295280..c25588f 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -68,6 +68,210 @@ function ff_vector_fmul_vfp, export=1 endfunc /** + * ARM VFP implementation of 'vector_fmul_window_c' function + * Assume that len is a positive non-zero 
number + */ +@ void ff_vector_fmul_window_vfp(float *dst, const float *src0, +@const float *src1, const float *win, int len) +function ff_vector_fmul_window_vfp, export=1 +DST0.reqa1 +SRC0.reqa2 +SRC1.reqa3 +WIN0.reqa4 +LEN .reqv1 +DST1.reqv2 +WIN1.reqv3 +OLDFPSCR .req ip + +push{v1-v3,lr} +ldr LEN, [sp, #4*4+0] +vpush {s16-s31} +fmrxOLDFPSCR, FPSCR +add DST1, DST0, LEN, lsl #3 +add SRC1, SRC1, LEN, lsl #2 +add WIN1, WIN0, LEN, lsl #3 + +tst LEN, #7 +beq 4f @ common case: len is a multiple of 8 + +ldr lr, =0x0300 @ RunFast mode, scalar mode +fmxrFPSCR, lr + +tst LEN, #1 +beq 1f +vldmdb WIN1!, {s0} +vldmia SRC0!, {s8} +vldmia WIN0!, {s16} +vmul.f s24, s0, s8 +vldmdb SRC1!, {s20} +vmul.f s8, s16, s8 +vmls.f s24, s16, s20 +vmla.f s8, s0, s20 +vstmia DST0!, {s24} +vstmdb DST1!, {s8} +1: +tst LEN, #2 +beq 2f +vldmdb WIN1!, {s0} +vldmdb WIN1!, {s1} +vldmia SRC0!, {s8-s9} +vldmia WIN0!, {s16-s17} +vmul.f s24, s0, s8 +vmul.f s25, s1, s9 +vldmdb SRC1!, {s20} +vldmdb SRC1!, {s21} +vmul.f s8, s16, s8 +vmul.f s9, s17, s9 +vmls.f s24, s16, s20 +vmls.f s25, s17, s21 +vmla.f s8, s0, s20 +vmla.f s9, s1, s21 +vstmia DST0!, {s24-s25} +vstmdb DST1!, {s8} +vstmdb DST1!, {s9} +2: +tst LEN, #4 +beq 3f +vldmdb WIN1!, {s0} +vldmdb WIN1!, {s1} +vldmdb WIN1!, {s2} +vldmdb WIN1!, {s3} +vldmia SRC0!, {s8-s11} +vldmia WIN0!, {s16-s19} +vmul.f s24, s0, s8 +vmul.f s25, s1, s9 +vmul.f s26, s2, s10 +vmul.f s27, s3, s11 +vldmdb SRC1!, {s20} +vldmdb SRC1!, {s21} +vldmdb SRC1!, {s22} +vldmdb SRC1!, {s23} +vmul.f s8, s16, s8 +vmul.f s9, s17, s9 +vmul.f s10, s18, s10 +vmul.f s11, s19, s11 +vmls.f s24, s16, s20 +vmls.f s25, s17, s21 +vmls.f s26, s18, s22 +vmls.f s27, s19, s23 +vmla.f s8, s0, s20 +vmla.f s9, s1, s21 +vmla.f s10, s2, s22 +vmla.f s11, s3, s23 +vstmia DST0!, {s24-s27} +vstmdb DST1!, {s8} +vstmdb DST1!, {s9} +vstmdb DST1!, {s10} +
[FFmpeg-cvslog] armv6: Accelerate butterflies_float
ffmpeg | branch: master | Ben Avison | Fri Jul 11 00:12:34 2014 +0100| [5a272190a04666f0fe41be767396b30712638c21] | committer: Martin Storsjö armv6: Accelerate butterflies_float I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 1542.8 43.7 1470.5 41.5 100.0% +4.9% butterflies_float 130.0 11.9 70.2 12.1 100.0% +85.2% Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5a272190a04666f0fe41be767396b30712638c21 --- libavutil/arm/float_dsp_init_vfp.c |4 ++ libavutil/arm/float_dsp_vfp.S | 116 2 files changed, 120 insertions(+) diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index f44020e..61ff2ed 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0, void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len); + av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) { if (!have_vfpv3(cpu_flags)) { @@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags) fdsp->vector_fmul_window = ff_vector_fmul_window_vfp; } fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; +if (!have_vfpv3(cpu_flags)) +fdsp->butterflies_float = ff_butterflies_float_vfp; } diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index c25588f..9f920aa 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop{d8-d15} bx lr endfunc + +/** + * ARM VFP implementation of 
'butterflies_float_c' function + * Assume that len is a positive non-zero number + */ +@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len) +function ff_butterflies_float_vfp, export=1 +BASE1 .reqa1 +BASE2 .reqa2 +LEN .reqa3 +OLDFPSCR .req a4 + +vpush {s16-s31} +fmrxOLDFPSCR, FPSCR + +tst LEN, #7 +beq 4f @ common case: len is a multiple of 8 + +ldr ip, =0x0300 @ RunFast mode, scalar mode +fmxrFPSCR, ip + +tst LEN, #1 +beq 1f +vldmia BASE1!, {s0} +vldmia BASE2!, {s8} +vadd.f s16, s0, s8 +vsub.f s24, s0, s8 +vstrs16, [BASE1, #0-4*1] +vstrs24, [BASE2, #0-4*1] +1: +tst LEN, #2 +beq 2f +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vadd.f s16, s0, s8 +vadd.f s17, s1, s9 +vsub.f s24, s0, s8 +vsub.f s25, s1, s9 +vstrd8, [BASE1, #0-8*1]@ s16,s17 +vstrd12, [BASE2, #0-8*1] @ s24,s25 +2: +tst LEN, #4 +beq 3f +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vldmia BASE1!, {s2-s3} +vldmia BASE2!, {s10-s11} +vadd.f s16, s0, s8 +vadd.f s17, s1, s9 +vsub.f s24, s0, s8 +vsub.f s25, s1, s9 +vadd.f s18, s2, s10 +vadd.f s19, s3, s11 +vsub.f s26, s2, s10 +vsub.f s27, s3, s11 +vstrd8, [BASE1, #0-16*1]@ s16,s17 +vstrd12, [BASE2, #0-16*1] @ s24,s25 +vstrd9, [BASE1, #8-16*1]@ s18,s19 +vstrd13, [BASE2, #8-16*1] @ s26,s27 +3: +bicsLEN, LEN, #7 +beq 7f +4: +ldr ip, =0x0303 @ RunFast mode, short vectors of length 4, stride 1 +fmxrFPSCR, ip + +vldmia BASE1!, {s0-s1} +vldmia BASE2!, {s8-s9} +vldmia BASE1!, {s2-s3} +vldmia BASE2!, {s10-s11} +vadd.f s16, s0, s8 +vldmia BASE1!, {s4-s5} +vldmia BASE2!, {s12-s13} +vldmia BASE1!, {s6-s7} +vldmia BASE2!, {s14-s15} +vsub.f s24, s0, s8 +vadd.f s20, s4, s12 +subsLEN, LEN, #8 +beq 6f +5: vldmia BASE1!, {s0-s3} +vldmia BASE2!, {s8-s11} +vsub.f s28, s4, s12 +vstrd8, [BASE1, #0-16*3]@ s16,s17 +vstrd9, [BASE1, #8-16*3]@ s18,s19 +vstrd12, [BASE2, #0-16*3] @ s24,s25 +vstrd13, [BASE2, #8-16*3] @ s26,s27 +
[FFmpeg-cvslog] arm: Macroize the test for 'setend' CPU instruction support
ffmpeg | branch: master | Ben Avison | Mon Jul 21 14:53:06 2014 +0100| [6869612f5c7d4d2f20f69a5658328a761deadb1c] | committer: Diego Biurrun arm: Macroize the test for 'setend' CPU instruction support Signed-off-by: Diego Biurrun > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6869612f5c7d4d2f20f69a5658328a761deadb1c --- libavcodec/arm/h264dsp_init_arm.c |6 +- libavutil/arm/cpu.h |6 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index f9712d8..7cb1312 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -104,12 +104,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, { int cpu_flags = av_get_cpu_flags(); -if (have_armv6(cpu_flags) && !(have_vfpv3(cpu_flags) || have_neon(cpu_flags))) { -// This function uses the 'setend' instruction which is deprecated -// on ARMv8. This instruction is serializing on some ARMv7 cores as -// well. Therefore, only use the function on ARMv6. +if (have_setend(cpu_flags)) c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; -} if (have_neon(cpu_flags)) h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/libavutil/arm/cpu.h b/libavutil/arm/cpu.h index 52e839c..224409a 100644 --- a/libavutil/arm/cpu.h +++ b/libavutil/arm/cpu.h @@ -30,4 +30,10 @@ #define have_vfpv3(flags) CPUEXT(flags, VFPV3) #define have_neon(flags)CPUEXT(flags, NEON) +/* Some functions use the 'setend' instruction which is deprecated on ARMv8 + * and serializing on some ARMv7 cores. This macro ensures such functions + * are only enabled on ARMv6. */ +#define have_setend(flags) \ +(have_armv6(flags) && !(have_vfpv3(flags) || have_neon(flags))) + #endif /* AVUTIL_ARM_CPU_H */ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog
[FFmpeg-cvslog] h264: Move start code search functions into separate source files.
ffmpeg | branch: master | Ben Avison | Mon Jul 21 16:25:48 2014 +0100| [db7f1c7c5a1d37e7f4da64a79a97bea1c4b6e9f8] | committer: Luca Barbato h264: Move start code search functions into separate source files. This permits re-use with parsers for codecs which use similar start codes. Signed-off-by: Luca Barbato > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=db7f1c7c5a1d37e7f4da64a79a97bea1c4b6e9f8 --- configure |3 +- libavcodec/Makefile|1 + libavcodec/arm/Makefile|2 +- libavcodec/arm/h264dsp_init_arm.c |5 +- libavcodec/arm/startcode.h | 26 + .../arm/{h264dsp_armv6.S => startcode_armv6.S} |4 +- libavcodec/h264_parser.c |2 +- libavcodec/h264dsp.c | 31 +-- libavcodec/h264dsp.h |2 +- libavcodec/startcode.c | 57 libavcodec/startcode.h | 26 + 11 files changed, 121 insertions(+), 38 deletions(-) diff --git a/configure b/configure index b9242e2..4fc1e6a 100755 --- a/configure +++ b/configure @@ -1578,6 +1578,7 @@ CONFIG_EXTRA=" rtpdec rtpenc_chain sinewin +startcode tpeldsp videodsp vp3dsp @@ -1794,7 +1795,7 @@ h263_decoder_select="error_resilience h263_parser h263dsp mpeg_er mpegvideo qpel h263_encoder_select="aandcttables h263dsp mpegvideoenc" h263i_decoder_select="h263_decoder" h263p_encoder_select="h263_encoder" -h264_decoder_select="cabac golomb h264chroma h264dsp h264pred h264qpel videodsp" +h264_decoder_select="cabac golomb h264chroma h264dsp h264pred h264qpel startcode videodsp" h264_decoder_suggest="error_resilience" hevc_decoder_select="bswapdsp cabac golomb videodsp" huffyuv_decoder_select="bswapdsp huffyuvdsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index a088a68..7d19e6e 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -79,6 +79,7 @@ OBJS-$(CONFIG_RANGECODER) += rangecoder.o RDFT-OBJS-$(CONFIG_HARDCODED_TABLES) += sin_tables.o OBJS-$(CONFIG_RDFT)+= rdft.o $(RDFT-OBJS-yes) OBJS-$(CONFIG_SINEWIN) += sinewin.o +OBJS-$(CONFIG_STARTCODE) += startcode.o OBJS-$(CONFIG_TPELDSP) += tpeldsp.o OBJS-$(CONFIG_VAAPI) += vaapi.o 
OBJS-$(CONFIG_VDA) += vda.o diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 742c3ee..6c2eb99 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -53,7 +53,6 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP)+= arm/videodsp_init_armv5te.o \ ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o ARMV6-OBJS-$(CONFIG_AC3DSP)+= arm/ac3dsp_armv6.o -ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ arm/hpeldsp_armv6.o ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ @@ -65,6 +64,7 @@ ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o +ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o ARMV6-OBJS-$(CONFIG_VP7_DECODER) += arm/vp8_armv6.o \ arm/vp8dsp_init_armv6.o \ arm/vp8dsp_armv6.o diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index 7cb1312..7afd350 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -23,8 +23,7 @@ #include "libavutil/attributes.h" #include "libavutil/arm/cpu.h" #include "libavcodec/h264dsp.h" - -int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); +#include "libavcodec/arm/startcode.h" void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); @@ -105,7 +104,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, int cpu_flags = av_get_cpu_flags(); if (have_setend(cpu_flags)) -c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; +c->startcode_find_candidate = ff_startcode_find_candidate_armv6; if (have_neon(cpu_flags)) h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h new file mode 100644 index 000..d7996c1 --- /dev/null +++ b/
[FFmpeg-cvslog] vc-1: Add platform-specific start code search routine to VC1DSPContext.
ffmpeg | branch: master | Ben Avison | Mon Jul 21 14:53:08 2014 +0100| [adf8227cf4e7b4fccb2ad88e1e09b6dc00dd00ed] | committer: Luca Barbato vc-1: Add platform-specific start code search routine to VC1DSPContext. Initialise VC1DSPContext for parser as well as for decoder. Note, the VC-1 code doesn't actually use the function pointer yet. Signed-off-by: Luca Barbato > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=adf8227cf4e7b4fccb2ad88e1e09b6dc00dd00ed --- configure|4 ++-- libavcodec/Makefile |2 +- libavcodec/arm/vc1dsp_init_arm.c |3 +++ libavcodec/vc1.c |2 ++ libavcodec/vc1dec.c |1 - libavcodec/vc1dsp.c |3 +++ libavcodec/vc1dsp.h |8 7 files changed, 19 insertions(+), 4 deletions(-) diff --git a/configure b/configure index 4fc1e6a..b2eb0c8 100755 --- a/configure +++ b/configure @@ -1885,7 +1885,7 @@ twinvq_decoder_select="mdct lsp sinewin" utvideo_decoder_select="bswapdsp" utvideo_encoder_select="bswapdsp huffman huffyuvencdsp" vble_decoder_select="huffyuvdsp" -vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel intrax8 mpeg_er qpeldsp" +vc1_decoder_select="blockdsp error_resilience h263_decoder h264chroma h264qpel intrax8 mpeg_er qpeldsp startcode" vc1image_decoder_select="vc1_decoder" vorbis_decoder_select="mdct" vorbis_encoder_select="mdct" @@ -1963,7 +1963,7 @@ wmv3_vdpau_hwaccel_select="vc1_vdpau_hwaccel" h264_parser_select="h264_decoder" mpegvideo_parser_select="mpegvideo" mpeg4video_parser_select="error_resilience h263dsp mpeg_er mpegvideo qpeldsp" -vc1_parser_select="mpegvideo" +vc1_parser_select="mpegvideo startcode" # external libraries libfaac_encoder_deps="libfaac" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 7d19e6e..d59bd1c 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -674,7 +674,7 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o OBJS-$(CONFIG_TAK_PARSER) += tak_parser.o tak.o 
-OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o \ +OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o vc1dsp.o \ msmpeg4.o msmpeg4data.o mpeg4video.o \ h263.o OBJS-$(CONFIG_VORBIS_PARSER) += vorbis_parser.o xiph.o diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c index 6d4eb79..a6a97c8 100644 --- a/libavcodec/arm/vc1dsp_init_arm.c +++ b/libavcodec/arm/vc1dsp_init_arm.c @@ -20,6 +20,7 @@ #include "libavutil/attributes.h" #include "libavutil/arm/cpu.h" +#include "libavcodec/arm/startcode.h" #include "libavcodec/vc1dsp.h" #include "vc1dsp.h" @@ -27,6 +28,8 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); +if (have_setend(cpu_flags)) +dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6; if (have_neon(cpu_flags)) ff_vc1dsp_init_neon(dsp); } diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c index 1978b08..cef0fe6 100644 --- a/libavcodec/vc1.c +++ b/libavcodec/vc1.c @@ -1688,5 +1688,7 @@ av_cold int ff_vc1_init_common(VC1Context *v) v->pq = -1; v->mvrange = 0; /* 7.1.1.18, p80 */ +ff_vc1dsp_init(&v->vc1dsp); + return 0; } diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index c83bb4f..f7f6a9f 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -5629,7 +5629,6 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) ff_blockdsp_init(&s->bdsp, avctx); ff_h264chroma_init(&v->h264chroma, 8); ff_qpeldsp_init(&s->qdsp); -ff_vc1dsp_init(&v->vc1dsp); if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) { int count = 0; diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c index 3b92eb2..a193dd7 100644 --- a/libavcodec/vc1dsp.c +++ b/libavcodec/vc1dsp.c @@ -29,6 +29,7 @@ #include "h264chroma.h" #include "qpeldsp.h" #include "vc1dsp.h" +#include "startcode.h" /* Apply overlap transform to horizontal edge */ static void vc1_v_overlap_c(uint8_t *src, int stride) @@ -948,6 +949,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext 
*dsp) dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c; #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ +dsp->startcode_find_candidate = ff_startcode_find_candidate_c; + if (ARCH_AARCH64) ff_vc1dsp
[FFmpeg-cvslog] vc-1: Optimise parser (with special attention to ARM)
ffmpeg | branch: master | Ben Avison | Mon Jul 21 14:53:09 2014 +0100| [701e8b42e12ad625c64ceae2252acb1de390278c] | committer: Luca Barbato vc-1: Optimise parser (with special attention to ARM) The previous implementation of the parser made four passes over each input buffer (reduced to two if the container format already guaranteed the input buffer corresponded to frames, such as with MKV). But these buffers are often 200K in size, certainly enough to flush the data out of L1 cache, and for many CPUs, all the way out to main memory. The passes were: 1) locate frame boundaries (not needed for MKV etc) 2) copy the data into a contiguous block (not needed for MKV etc) 3) locate the start codes within each frame 4) unescape the data between start codes After this, the unescaped data was parsed to extract certain header fields, but because the unescape operation was so large, this was usually also effectively operating on uncached memory. Most of the unescaped data was simply thrown away and never processed further. Only step 2 - because it used memcpy - was using prefetch, making things even worse. This patch reorganises these steps so that, aside from the copying, the operations are performed in parallel, maximising cache utilisation. No more than the worst-case number of bytes needed for header parsing is unescaped. Most of the data is, in practice, only read in order to search for a start code, for which optimised implementations already existed in the H264 codec (notably the ARM version uses prefetch, so we end up doing both remaining passes at maximum speed). For MKV files, we know when we've found the last start code of interest in a given frame, so we are able to avoid doing even that one remaining pass for most of the buffer. In some use-cases (such as the Raspberry Pi) video decode is handled by the GPU, but the entire elementary stream is still fed through the parser to pick out certain elements of the header which are necessary to manage the decode process. 
As you might expect, in these cases, the performance of the parser is significant. To measure parser performance, I used the same VC-1 elementary stream in either an MPEG-2 transport stream or a MKV file, and fed it through avconv with -c:v copy -c:a copy -f null. These are the gperftools counts for those streams, both filtered to only include vc1_parse() and its callees, and unfiltered (to include the whole binary). Lower numbers are better: Before After File Filtered Mean StdDev Mean StdDev Confidence Change M2TS No 861.7 8.2 650.5 8.1 100.0% +32.5% MKV No 868.9 7.4 731.7 9.0 100.0% +18.8% M2TS Yes 250.0 11.2 27.2 3.4 100.0% +817.9% MKV Yes 149.0 12.8 1.7 0.8 100.0% +8526.3% Yes, that last case shows vc1_parse() running 86 times faster! The M2TS case does show a larger absolute improvement though, since it was worse to begin with. This patch has been tested with the FATE suite (albeit on x86 for speed). Signed-off-by: Luca Barbato > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=701e8b42e12ad625c64ceae2252acb1de390278c --- libavcodec/vc1_parser.c | 276 ++- 1 file changed, 175 insertions(+), 101 deletions(-) diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c index 1bedd98..43ca0ed 100644 --- a/libavcodec/vc1_parser.c +++ b/libavcodec/vc1_parser.c @@ -30,117 +30,84 @@ #include "vc1.h" #include "get_bits.h" +/** The maximum number of bytes of a sequence, entry point or + * frame header whose values we pay any attention to */ +#define UNESCAPED_THRESHOLD 37 + +/** The maximum number of bytes of a sequence, entry point or + * frame header which must be valid memory (because they are + * used to update the bitstream cache in skip_bits() calls) + */ +#define UNESCAPED_LIMIT 144 + +typedef enum { +NO_MATCH, +ONE_ZERO, +TWO_ZEROS, +ONE +} VC1ParseSearchState; + typedef struct { ParseContext pc; VC1Context v; +uint8_t prev_start_code; +size_t bytes_to_skip; +uint8_t unesc_buffer[UNESCAPED_LIMIT]; +size_t unesc_index; +VC1ParseSearchState search_state; } 
VC1ParseContext; -static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx, -const uint8_t *buf, int buf_size) +static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx, + const uint8_t *buf, int buf_size) { +/* Parse the header we just finished unescaping */ VC1ParseContext *vpc = s->priv_data; GetBitContext gb; -const uint8_t *start, *end, *next; -uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE); - vpc->v.s.avctx = avctx; vpc->v.parse_only = 1; -next = buf; -s->repeat_pict = 0; - -for(start = buf, end = buf + buf_size;
[FFmpeg-cvslog] checkasm: Add vc1dsp in-loop deblocking filter tests
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:42 2022 +0100| [20cb43ea8ba0471dcba442b8de8fa17ff41f6281] | committer: Martin Storsjö checkasm: Add vc1dsp in-loop deblocking filter tests Note that the benchmarking results for these functions are highly dependent upon the input data. Therefore, each function is benchmarked twice, corresponding to the best and worst case complexity of the reference C implementation. The performance of a real stream decode will fall somewhere between these two extremes. Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=20cb43ea8ba0471dcba442b8de8fa17ff41f6281 --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vc1dsp.c | 102 ++ tests/fate/checkasm.mak | 1 + 5 files changed, 108 insertions(+) diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index f768b1144e..7133a6ee66 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -11,6 +11,7 @@ AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o +AVCODECOBJS-$(CONFIG_VC1DSP)+= vc1dsp.o AVCODECOBJS-$(CONFIG_VP8DSP)+= vp8dsp.o AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 748d6a9f3a..c2efd81b6d 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -147,6 +147,9 @@ static const struct { #if CONFIG_V210_ENCODER { "v210enc", checkasm_check_v210enc }, #endif +#if CONFIG_VC1DSP +{ "vc1dsp", checkasm_check_vc1dsp }, +#endif #if CONFIG_VP8DSP { "vp8dsp", checkasm_check_vp8dsp }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index c3192d8c23..52ab18a5b1 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -78,6 +78,7 @@ void checkasm_check_sw_scale(void); void 
checkasm_check_utvideodsp(void); void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); +void checkasm_check_vc1dsp(void); void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c new file mode 100644 index 00..2fd6c74d6c --- /dev/null +++ b/tests/checkasm/vc1dsp.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include + +#include "checkasm.h" + +#include "libavcodec/vc1dsp.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, + +typedef struct { +const char *name; +size_t offset; +} test; + +#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ +do {\ +uint8_t *p##0 = name##0, *p##1 = name##1; \ +int i = (size); \ +while (i-- > 0) { \ +int x = 0x80 | (rnd() & 0x7F); \ +x >>= rnd() % 9;\ +if (rnd() & 1) \ +x = -x; \ +*p##1++ = *p##0++ = 0x80 + x; \ +} \ +} while (0) + +static void check_loop_filter(void) +{ +/* Deblocking filter buffers are big enough to hold a 16x16 block, + * plus 16 columns left and 4 rows above to hold filter inputs + * (depending on whether v or h neighbouring block edge, oversized + * horizontally to maintain 16-byte alignment) plus 16 columns and + * 4 rows below to catch write overflows */ +LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); +LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); + +
[FFmpeg-cvslog] checkasm: Add vc1dsp inverse transform tests
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:43 2022 +0100| [2698bfdc93d456d304a38b570052e1a238d64c54] | committer: Martin Storsjö checkasm: Add vc1dsp inverse transform tests This test deliberately doesn't exercise the full range of inputs described in the committee draft VC-1 standard. It says: input coefficients in frequency domain, D, satisfy -2048 <= D < 2047 intermediate coefficients, E, satisfy -4096 <= E < 4095 fully inverse-transformed coefficients, R, satisfy -512 <= R < 511 For one thing, the inequalities look odd. Did they mean them to go the other way round? That would make more sense because the equations generally both add and subtract coefficients multiplied by constants, including powers of 2. Requiring the most-negative values to be valid extends the number of bits to represent the intermediate values just for the sake of that one case! For another thing, the extreme values don't look to occur in real streams - both in my experience and supported by the following comment in the AArch32 decoder: tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c). This is done because sometimes files have input that causes tN + tM to overflow. To avoid this overflow, we compute tNhalf, then compute tNhalf + tM (which doesn't overflow), and then we use vhadd to compute (tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is one instruction. My AArch64 decoder goes further than this. It calculates tNhalf and tM then does an SRA (essentially a fused halve and add) to compute (tN + tM) >> 1 without ever having to hold (tNhalf + tM) in a 16-bit element without overflowing. It only encounters difficulties if either tNhalf or tM overflow in isolation. I haven't had sight of the final standard, so it's possible that these issues were dealt with during finalisation, which could explain the lack of usage of extreme inputs in real streams. 
Or a preponderance of decoders that only support 16-bit intermediate values in their inverse transforms might have caused encoders to steer clear of such cases. I have effectively followed this approach in the test, and limited the scale of the coefficients sufficient that both the existing AArch32 decoder and my new AArch64 decoder both pass. Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2698bfdc93d456d304a38b570052e1a238d64c54 --- tests/checkasm/vc1dsp.c | 283 1 file changed, 283 insertions(+) diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c index 2fd6c74d6c..7d4457306f 100644 --- a/tests/checkasm/vc1dsp.c +++ b/tests/checkasm/vc1dsp.c @@ -30,12 +30,208 @@ #include "libavutil/mem_internal.h" #define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, +#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, typedef struct { const char *name; size_t offset; +int width; +int height; } test; +typedef struct matrix { +size_t width; +size_t height; +float d[]; +} matrix; + +static const matrix T8 = { 8, 8, { +12, 12, 12, 12, 12, 12, 12, 12, +16, 15, 9, 4, -4, -9, -15, -16, +16, 6, -6, -16, -16, -6, 6, 16, +15, -4, -16, -9, 9, 16, 4, -15, +12, -12, -12, 12, 12, -12, -12, 12, + 9, -16, 4, 15, -15, -4, 16, -9, + 6, -16, 16, -6, -6, 16, -16, 6, + 4, -9, 15, -16, 16, -15, 9, -4 +} }; + +static const matrix T4 = { 4, 4, { +17, 17, 17, 17, +22, 10, -10, -22, +17, -17, -17, 17, +10, -22, 22, -10 +} }; + +static const matrix T8t = { 8, 8, { +12, 16, 16, 15, 12, 9, 6, 4, +12, 15, 6, -4, -12, -16, -16, -9, +12, 9, -6, -16, -12, 4, 16, 15, +12, 4, -16, -9, 12, 15, -6, -16, +12, -4, -16, 9, 12, -15, -6, 16, +12, -9, -6, 16, -12, -4, 16, -15, +12, -15, 6, 4, -12, 16, -16, 9, +12, -16, 16, -15, 12, -9, 6, -4 +} }; + +static const matrix T4t = { 4, 4, { +17, 22, 17, 10, +17, 10, -17, -22, +17, -10, -17, 22, +17, -22, 17, -10 +} }; + +static matrix 
*new_matrix(size_t width, size_t height) +{ +matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); +if (out == NULL) { +fprintf(stderr, "Memory allocation failure\n"); +exit(EXIT_FAILURE); +} +out->width = width; +out->height = height; +return out; +} + +static matrix *multiply(const matrix *a, const matrix *b) +{ +matrix *out; +if (a->width != b->height) { +fprintf(stderr, "Incompatible multiplication\n"); +exit(EXIT_FAILURE); +} +o
[FFmpeg-cvslog] checkasm: Add idctdsp add/put-pixels-clamped tests
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:44 2022 +0100| [bd3615a81a3387cafb51444927e852423f8f4a6e] | committer: Martin Storsjö checkasm: Add idctdsp add/put-pixels-clamped tests Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bd3615a81a3387cafb51444927e852423f8f4a6e --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/idctdsp.c | 98 +++ tests/fate/checkasm.mak | 1 + 5 files changed, 104 insertions(+) diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 7133a6ee66..f6b1008855 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -9,6 +9,7 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o +AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o AVCODECOBJS-$(CONFIG_VC1DSP)+= vc1dsp.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index c2efd81b6d..57134f96ea 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -123,6 +123,9 @@ static const struct { #if CONFIG_HUFFYUV_DECODER { "huffyuvdsp", checkasm_check_huffyuvdsp }, #endif +#if CONFIG_IDCTDSP +{ "idctdsp", checkasm_check_idctdsp }, +#endif #if CONFIG_JPEG2000_DECODER { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 52ab18a5b1..a86db140e3 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -64,6 +64,7 @@ void checkasm_check_hevc_idct(void); void checkasm_check_hevc_pel(void); void checkasm_check_hevc_sao(void); void checkasm_check_huffyuvdsp(void); +void checkasm_check_idctdsp(void); void checkasm_check_jpeg2000dsp(void); void checkasm_check_llviddsp(void); void checkasm_check_llviddspenc(void); 
diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c new file mode 100644 index 00..02724536a7 --- /dev/null +++ b/tests/checkasm/idctdsp.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" + +#include "libavcodec/idctdsp.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, + +typedef struct { +const char *name; +size_t offset; +} test; + +#define RANDOMIZE_BUFFER16(name, size) \ +do {\ +int i; \ +for (i = 0; i < size; ++i) {\ +uint16_t r = rnd() % 0x201 - 0x100; \ +AV_WN16A(name##0 + i, r); \ +AV_WN16A(name##1 + i, r); \ +} \ +} while (0) + +#define RANDOMIZE_BUFFER8(name, size) \ +do { \ +int i;\ +for (i = 0; i < size; ++i) { \ +uint8_t r = rnd();\ +name##0[i] = r; \ +name##1[i] = r; \ +} \ +} while (0) + +static void check_add_put_clamped(void) +{ +/* Source buffers are only as big as needed, since any over-read won't affect results */ +LOCAL_ALIGNED_16(int16_t, src0, [64]); +LOCAL_ALIGNED_16(int16_t, src1, [64]); +/* Destination buffers have borders of one row above/below and 8 columns left/right to catch 
overflows */ +LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); +LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); + +AVCodecContext avctx = { 0 }; +IDCTDSPContext h; + +c
[FFmpeg-cvslog] avcodec/vc1: Introduce fast path for unescaping bitstream buffer
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:45 2022 +0100| [2e268477802d64aa75b9c3c2cb2fc89d1ef7c87d] | committer: Martin Storsjö avcodec/vc1: Introduce fast path for unescaping bitstream buffer Includes a checkasm test. Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2e268477802d64aa75b9c3c2cb2fc89d1ef7c87d --- libavcodec/vc1dec.c | 20 +++ libavcodec/vc1dsp.c | 2 ++ libavcodec/vc1dsp.h | 3 +++ tests/checkasm/vc1dsp.c | 67 + 4 files changed, 82 insertions(+), 10 deletions(-) diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index e279ffd1c1..0426e8a752 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -491,7 +491,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) size = next - start - 4; if (size <= 0) continue; -buf2_size = vc1_unescape_buffer(start + 4, size, buf2); +buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); init_get_bits(&gb, buf2, buf2_size * 8); switch (AV_RB32(start)) { case VC1_CODE_SEQHDR: @@ -681,7 +681,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, case VC1_CODE_FRAME: if (avctx->hwaccel) buf_start = start; -buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); +buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); break; case VC1_CODE_FIELD: { int buf_size3; @@ -698,8 +698,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } -buf_size3 = vc1_unescape_buffer(start + 4, size, -slices[n_slices].buf); +buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, + slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; @@ -710,7 +710,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, break; } case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ -buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); +buf_size2 = 
v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); init_get_bits(&s->gb, buf2, buf_size2 * 8); ff_vc1_decode_entry_point(avctx, v, &s->gb); break; @@ -727,8 +727,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } -buf_size3 = vc1_unescape_buffer(start + 4, size, -slices[n_slices].buf); +buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, + slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); @@ -762,7 +762,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } -buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); +buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = s->mb_height + 1 >> 1; @@ -771,9 +771,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, n_slices1 = n_slices - 1; n_slices++; } -buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); +buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); } else { -buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); +buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); } init_get_bits(&s->gb, buf2, buf_size2*8); } else{ diff --git a/libavcodec
[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON deblocking filter fast paths
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:46 2022 +0100| [c62bbd4d2015ffa717369e687601fb2d481af6b0] | committer: Martin Storsjö avcodec/vc1: Arm 64-bit NEON deblocking filter fast paths checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C version can still outperform the NEON version in specific cases. The balance between different code paths is stream-dependent, but in practice the best case happens about 5% of the time, the worst case happens about 40% of the time, and the complexity of the remaining cases fall somewhere in between. Therefore, taking the average of the best and worst case timings is probably a conservative estimate of the degree by which the NEON code improves performance. vc1dsp.vc1_h_loop_filter4_bestcase_c: 10.7 vc1dsp.vc1_h_loop_filter4_bestcase_neon: 43.5 vc1dsp.vc1_h_loop_filter4_worstcase_c: 184.5 vc1dsp.vc1_h_loop_filter4_worstcase_neon: 73.7 vc1dsp.vc1_h_loop_filter8_bestcase_c: 31.2 vc1dsp.vc1_h_loop_filter8_bestcase_neon: 62.2 vc1dsp.vc1_h_loop_filter8_worstcase_c: 358.2 vc1dsp.vc1_h_loop_filter8_worstcase_neon: 88.2 vc1dsp.vc1_h_loop_filter16_bestcase_c: 51.0 vc1dsp.vc1_h_loop_filter16_bestcase_neon: 107.7 vc1dsp.vc1_h_loop_filter16_worstcase_c: 722.7 vc1dsp.vc1_h_loop_filter16_worstcase_neon: 140.5 vc1dsp.vc1_v_loop_filter4_bestcase_c: 9.7 vc1dsp.vc1_v_loop_filter4_bestcase_neon: 43.0 vc1dsp.vc1_v_loop_filter4_worstcase_c: 178.7 vc1dsp.vc1_v_loop_filter4_worstcase_neon: 69.0 vc1dsp.vc1_v_loop_filter8_bestcase_c: 30.2 vc1dsp.vc1_v_loop_filter8_bestcase_neon: 50.7 vc1dsp.vc1_v_loop_filter8_worstcase_c: 353.0 vc1dsp.vc1_v_loop_filter8_worstcase_neon: 69.2 vc1dsp.vc1_v_loop_filter16_bestcase_c: 60.0 vc1dsp.vc1_v_loop_filter16_bestcase_neon: 90.0 vc1dsp.vc1_v_loop_filter16_worstcase_c: 714.2 vc1dsp.vc1_v_loop_filter16_worstcase_neon: 97.2 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c62bbd4d2015ffa717369e687601fb2d481af6b0 --- 
libavcodec/aarch64/Makefile | 1 + libavcodec/aarch64/vc1dsp_init_aarch64.c | 14 + libavcodec/aarch64/vc1dsp_neon.S | 692 +++ 3 files changed, 707 insertions(+) diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 954461f81d..5b25e4dfb9 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -48,6 +48,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o NEON-OBJS-$(CONFIG_MDCT)+= aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP)+= aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o +NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index 13dfd74940..8f96e4802d 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -25,6 +25,13 @@ #include "config.h" +void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, @@ -39,6 +46,13 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { +dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; +dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; +dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; +dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; +dsp->vc1_v_loop_filter16 = 
ff_vc1_v_loop_filter16_neon; +dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S new file mode 100644 index 00..1ea9fa75ff --- /dev/null +++ b/libavcodec/aarch64/vc1dsp_neon.S @@ -0,0 +1,692 @@ +/* + * VC1 AArch64 NEON optimisations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it u
[FFmpeg-cvslog] avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:47 2022 +0100| [c07de58a725a508c628ddea7d936771c42c189aa] | committer: Martin Storsjö avcodec/vc1: Arm 32-bit NEON deblocking filter fast paths checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C version can still outperform the NEON version in specific cases. The balance between different code paths is stream-dependent, but in practice the best case happens about 5% of the time, the worst case happens about 40% of the time, and the complexity of the remaining cases fall somewhere in between. Therefore, taking the average of the best and worst case timings is probably a conservative estimate of the degree by which the NEON code improves performance. vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0 vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5 vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7 vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2 vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0 vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0 vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0 vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7 vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7 vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0 vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7 vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7 vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2 vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2 vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2 vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5 vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5 vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2 vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2 vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7 vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2 vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7 vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5 vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c07de58a725a508c628ddea7d936771c42c189aa --- 
libavcodec/arm/vc1dsp_init_neon.c | 14 + libavcodec/arm/vc1dsp_neon.S | 643 ++ 2 files changed, 657 insertions(+) diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c index 2cca784f5a..f5f5c702d7 100644 --- a/libavcodec/arm/vc1dsp_init_neon.c +++ b/libavcodec/arm/vc1dsp_init_neon.c @@ -32,6 +32,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd); @@ -92,6 +99,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; +dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; +dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; +dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; +dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; +dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; +dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; FN_ASSIGN(1, 0); FN_ASSIGN(2, 0); diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S index 93f043bf08..ba54221ef6 100644 --- a/libavcodec/arm/vc1dsp_neon.S +++ b/libavcodec/arm/vc1dsp_neon.S @@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 vst1.32 {d1[1]}, [r0,:32] bx lr 
endfunc + +@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of lower block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter4_neon, export=1 +sub r3, r0, r1, lsl #2 +vldr d0, .Lcoeffs +vld1.32 {d1[0]}, [r0], r1 @ P5 +vld1.32 {d2[0]}, [r3], r1 @ P1 +vld1.32 {d3[0]}, [r3], r1 @ P2 +vld1.32 {d4[0]}, [r0], r1 @ P6 +vld1.32 {d5[0]}, [r3], r1 @ P3 +vld1.32 {d6[0]}, [r0], r1 @ P7 +vld1.32 {d7[0]}, [r3] @ P4 +vld1.32 {d16[0]}, [r0]
[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON inverse transform fast paths
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:48 2022 +0100| [501fdc017deb1b57ecc17420ba41686a14932fcc] | committer: Martin Storsjö avcodec/vc1: Arm 64-bit NEON inverse transform fast paths checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. vc1dsp.vc1_inv_trans_4x4_c: 158.2 vc1dsp.vc1_inv_trans_4x4_neon: 65.7 vc1dsp.vc1_inv_trans_4x4_dc_c: 86.5 vc1dsp.vc1_inv_trans_4x4_dc_neon: 26.5 vc1dsp.vc1_inv_trans_4x8_c: 335.2 vc1dsp.vc1_inv_trans_4x8_neon: 106.2 vc1dsp.vc1_inv_trans_4x8_dc_c: 151.2 vc1dsp.vc1_inv_trans_4x8_dc_neon: 25.5 vc1dsp.vc1_inv_trans_8x4_c: 365.7 vc1dsp.vc1_inv_trans_8x4_neon: 97.2 vc1dsp.vc1_inv_trans_8x4_dc_c: 139.7 vc1dsp.vc1_inv_trans_8x4_dc_neon: 16.5 vc1dsp.vc1_inv_trans_8x8_c: 547.7 vc1dsp.vc1_inv_trans_8x8_neon: 137.0 vc1dsp.vc1_inv_trans_8x8_dc_c: 268.2 vc1dsp.vc1_inv_trans_8x8_dc_neon: 30.5 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=501fdc017deb1b57ecc17420ba41686a14932fcc --- libavcodec/aarch64/vc1dsp_init_aarch64.c | 19 + libavcodec/aarch64/vc1dsp_neon.S | 678 +++ 2 files changed, 697 insertions(+) diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index 8f96e4802d..e0eb52dd63 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -25,6 +25,16 @@ #include "config.h" +void ff_vc1_inv_trans_8x8_neon(int16_t *block); +void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + +void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, 
ptrdiff_t stride, int16_t *block); + void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); @@ -46,6 +56,15 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { +dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; +dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; +dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; +dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; +dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; +dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; +dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; +dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S index 1ea9fa75ff..0201db4f78 100644 --- a/libavcodec/aarch64/vc1dsp_neon.S +++ b/libavcodec/aarch64/vc1dsp_neon.S @@ -22,7 +22,685 @@ #include "libavutil/aarch64/asm.S" +// VC-1 8x8 inverse transform +// On entry: +// x0 -> array of 16-bit inverse transform coefficients, in column-major order +// On exit: +// array at x0 updated to hold transformed block; also now held in row-major order +function ff_vc1_inv_trans_8x8_neon, export=1 +ld1 {v1.16b, v2.16b}, [x0], #32 +ld1 {v3.16b, v4.16b}, [x0], #32 +ld1 {v5.16b, v6.16b}, [x0], #32 +shl v1.8h, v1.8h, #2// 8/2 * src[0] +sub x1, x0, #3*32 +ld1 {v16.16b, v17.16b}, [x0] +shl v7.8h, v2.8h, #4// 16 * src[8] +shl v18.8h, v2.8h, #2 // 4 * src[8] +shl v19.8h, v4.8h, #4 //16 * src[24] +ldr d0, .Lcoeffs_it8 +shl v5.8h, v5.8h, #2// 8/2 * src[32] +shl v20.8h, v6.8h, #4 // 16 * src[40] +shl v21.8h, v6.8h, #2 // 4 * src[40] +shl v22.8h, v17.8h, #4 // 16 * src[56] 
+ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] +mul v23.8h, v3.8h, v0.h[0] // 6/2
[FFmpeg-cvslog] avcodec/vc1: Arm 32-bit NEON unescape fast path
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:51 2022 +0100| [23c92e14f5fdb0c2928b44bb94d4c0711439e1c7] | committer: Martin Storsjö avcodec/vc1: Arm 32-bit NEON unescape fast path checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. vc1dsp.vc1_unescape_buffer_c: 918624.7 vc1dsp.vc1_unescape_buffer_neon: 142958.0 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=23c92e14f5fdb0c2928b44bb94d4c0711439e1c7 --- libavcodec/arm/vc1dsp_init_neon.c | 61 libavcodec/arm/vc1dsp_neon.S | 118 ++ 2 files changed, 179 insertions(+) diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c index f5f5c702d7..48cb816b70 100644 --- a/libavcodec/arm/vc1dsp_init_neon.c +++ b/libavcodec/arm/vc1dsp_init_neon.c @@ -19,6 +19,7 @@ #include #include "libavutil/attributes.h" +#include "libavutil/intreadwrite.h" #include "libavcodec/vc1dsp.h" #include "vc1dsp.h" @@ -84,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); + +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) +{ +/* Dealing with starting and stopping, and removing escape bytes, are + * comparatively less time-sensitive, so are more clearly expressed using + * a C wrapper around the assembly inner loop. Note that we assume a + * little-endian machine that supports unaligned loads. 
*/ +int dsize = 0; +while (size >= 4) +{ +int found = 0; +while (!found && (((uintptr_t) dst) & 7) && size >= 4) +{ +found = (AV_RL32(src) &~ 0x0300) == 0x0003; +if (!found) +{ +*dst++ = *src++; +--size; +++dsize; +} +} +if (!found) +{ +int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); +dst += skip; +src += skip; +size -= skip; +dsize += skip; +while (!found && size >= 4) +{ +found = (AV_RL32(src) &~ 0x0300) == 0x0003; +if (!found) +{ +*dst++ = *src++; +--size; +++dsize; +} +} +} +if (found) +{ +*dst++ = *src++; +*dst++ = *src++; +++src; +size -= 3; +dsize += 2; +} +} +while (size > 0) +{ +*dst++ = *src++; +--size; +++dsize; +} +return dsize; +} + #define FN_ASSIGN(X, Y) \ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon @@ -130,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; + +dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; } diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S index ba54221ef6..96014fbebc 100644 --- a/libavcodec/arm/vc1dsp_neon.S +++ b/libavcodec/arm/vc1dsp_neon.S @@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1 4: vpop{d8-d15} pop {r4-r6,pc} endfunc + +@ Copy at most the specified number of bytes from source to destination buffer, +@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence +@ On entry: +@ r0 -> source buffer +@ r1 = max number of bytes to copy +@ r2 -> destination buffer, optimally 8-byte aligned +@ On exit: +@ r0 = number of bytes not copied +function ff_vc1_unescape_buffer_helper_neon, export=1 +@ Offset by 48 to screen out cases that are too short for us to handle, +@ and also make it easy to test 
for loop termination, or to determine +@ whether we need an odd number of half-iterations of the loop. +subs r1, r1, #48 +bmi 90f + +@ Set up useful constants +vmov.i32 q0, #0x300 +vmov.i32 q1, #0x3 + +tst r1, #16 +bne 1f + + vld1.8 {q8, q9}, [r0]! + vbic q12, q8, q0 + vext.8 q13, q8, q9, #1 + vext.8 q14, q8, q9, #2 +
[FFmpeg-cvslog] avcodec/idctdsp: Arm 64-bit NEON block add and clamp fast paths
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:49 2022 +0100| [5379412ed0c587d82788c6fc46b7787cfe10f72d] | committer: Martin Storsjö avcodec/idctdsp: Arm 64-bit NEON block add and clamp fast paths checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. idctdsp.add_pixels_clamped_c: 313.3 idctdsp.add_pixels_clamped_neon: 24.3 idctdsp.put_pixels_clamped_c: 220.3 idctdsp.put_pixels_clamped_neon: 15.5 idctdsp.put_signed_pixels_clamped_c: 210.5 idctdsp.put_signed_pixels_clamped_neon: 19.5 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5379412ed0c587d82788c6fc46b7787cfe10f72d --- libavcodec/aarch64/Makefile | 3 +- libavcodec/aarch64/idctdsp_init_aarch64.c | 26 -- libavcodec/aarch64/idctdsp_neon.S | 130 ++ 3 files changed, 150 insertions(+), 9 deletions(-) diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 5b25e4dfb9..c8935f205e 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -44,7 +44,8 @@ NEON-OBJS-$(CONFIG_H264PRED)+= aarch64/h264pred_neon.o NEON-OBJS-$(CONFIG_H264QPEL)+= aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o -NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o +NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ + aarch64/simple_idct_neon.o NEON-OBJS-$(CONFIG_MDCT)+= aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP)+= aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c index 742a3372e3..eec21aa5a2 100644 --- a/libavcodec/aarch64/idctdsp_init_aarch64.c +++ b/libavcodec/aarch64/idctdsp_init_aarch64.c @@ -27,19 +27,29 @@ #include "libavcodec/idctdsp.h" #include "idct.h" +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void 
ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); -if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { -if (avctx->idct_algo == FF_IDCT_AUTO || -avctx->idct_algo == FF_IDCT_SIMPLEAUTO || -avctx->idct_algo == FF_IDCT_SIMPLENEON) { -c->idct_put = ff_simple_idct_put_neon; -c->idct_add = ff_simple_idct_add_neon; -c->idct = ff_simple_idct_neon; -c->perm_type = FF_IDCT_PERM_PARTTRANS; +if (have_neon(cpu_flags)) { +if (!avctx->lowres && !high_bit_depth) { +if (avctx->idct_algo == FF_IDCT_AUTO || +avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +avctx->idct_algo == FF_IDCT_SIMPLENEON) { +c->idct_put = ff_simple_idct_put_neon; +c->idct_add = ff_simple_idct_add_neon; +c->idct = ff_simple_idct_neon; +c->perm_type = FF_IDCT_PERM_PARTTRANS; +} } + +c->add_pixels_clamped= ff_add_pixels_clamped_neon; +c->put_pixels_clamped= ff_put_pixels_clamped_neon; +c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; } } diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S new file mode 100644 index 00..7f47611206 --- /dev/null +++ b/libavcodec/aarch64/idctdsp_neon.S @@ -0,0 +1,130 @@ +/* + * IDCT AArch64 NEON optimisations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// Clamp 16-bit signed block coefficients to unsigned 8-bit +//
[FFmpeg-cvslog] avcodec/vc1: Arm 64-bit NEON unescape fast path
ffmpeg | branch: master | Ben Avison | Thu Mar 31 18:23:50 2022 +0100| [6eee65028957c3b16287a204e648caebcc86b06c] | committer: Martin Storsjö avcodec/vc1: Arm 64-bit NEON unescape fast path checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. vc1dsp.vc1_unescape_buffer_c: 655617.7 vc1dsp.vc1_unescape_buffer_neon: 118237.0 Signed-off-by: Ben Avison Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6eee65028957c3b16287a204e648caebcc86b06c --- libavcodec/aarch64/vc1dsp_init_aarch64.c | 61 +++ libavcodec/aarch64/vc1dsp_neon.S | 176 +++ 2 files changed, 237 insertions(+) diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index e0eb52dd63..a7976fd596 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -21,6 +21,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/aarch64/cpu.h" +#include "libavutil/intreadwrite.h" #include "libavcodec/vc1dsp.h" #include "config.h" @@ -51,6 +52,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); + +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) +{ +/* Dealing with starting and stopping, and removing escape bytes, are + * comparatively less time-sensitive, so are more clearly expressed using + * a C wrapper around the assembly inner loop. Note that we assume a + * little-endian machine that supports unaligned loads. 
*/ +int dsize = 0; +while (size >= 4) +{ +int found = 0; +while (!found && (((uintptr_t) dst) & 7) && size >= 4) +{ +found = (AV_RL32(src) &~ 0x0300) == 0x0003; +if (!found) +{ +*dst++ = *src++; +--size; +++dsize; +} +} +if (!found) +{ +int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); +dst += skip; +src += skip; +size -= skip; +dsize += skip; +while (!found && size >= 4) +{ +found = (AV_RL32(src) &~ 0x0300) == 0x0003; +if (!found) +{ +*dst++ = *src++; +--size; +++dsize; +} +} +} +if (found) +{ +*dst++ = *src++; +*dst++ = *src++; +++src; +size -= 3; +dsize += 2; +} +} +while (size > 0) +{ +*dst++ = *src++; +--size; +++dsize; +} +return dsize; +} + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); @@ -76,5 +135,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; + +dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; } } diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S index 0201db4f78..9a96c2523c 100644 --- a/libavcodec/aarch64/vc1dsp_neon.S +++ b/libavcodec/aarch64/vc1dsp_neon.S @@ -1368,3 +1368,179 @@ function ff_vc1_h_loop_filter16_neon, export=1 st2 {v2.b, v3.b}[7], [x6] 4: ret endfunc + +// Copy at most the specified number of bytes from source to destination buffer, +// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence +// On entry: +// x0 -> source buffer +// w1 = max number of bytes to copy +// x2 -> destination buffer, optimally 8-byte aligned +// On exit: +// w0 = number of bytes not copied +function ff_vc1_unescape_buffer_helper_neon, export=1 +// Offset by 80 to screen out cases that are too short for us to handle, +// and also make it easy to test for loop termination, or to determine +// whether we need 
an odd number of half-iterations of the loop. +subs w1, w1, #80 +b.mi 90f + +// Set up useful constants +movi v20.4s, #3, lsl #24 +movi v21.4s, #3, lsl #16 + +tst w1, #32 +b.ne 1f + + ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 + ext v25.16b, v0.16b, v1.16b