The previous implementation targeted DTS Coherent Acoustics, which only requires nbits == 4 (fft16()). This case was (and still is) linked directly rather than being indirected through ff_fft_calc_vfp(), but now the full range from radix-4 up to radix-65536 is available. This benefits other codecs such as AAC and AC3.
The implementaion is based upon the C version, with each routine larger than radix-16 calling a hierarchy of smaller FFT functions, then performing a post-processing pass. This pass benefits a lot from loop unrolling to counter the long pipelines in the VFP. A relaxed calling standard also reduces the overhead of the call hierarchy, and avoiding the excessive inlining performed by GCC probably helps with I-cache utilisation too. I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in the FFT routines (fft4() to fft512() and pass()) for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 2245.5 53.1 1599.6 43.8 100.0% +40.4% FFT routines 940.6 22.0 348.1 20.8 100.0% +170.2% --- libavcodec/arm/fft_init_arm.c | 8 +- libavcodec/arm/fft_vfp.S | 261 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 252 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 7e49b9c..5087f5f 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -23,6 +23,8 @@ #include "libavcodec/rdft.h" #include "libavcodec/synth_filter.h" +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); + void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); @@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); - if (have_vfp(cpu_flags)) { + if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { + s->fft_calc = ff_fft_calc_vfp; #if CONFIG_MDCT - if (!have_vfpv3(cpu_flags)) - s->imdct_half = ff_imdct_half_vfp; + s->imdct_half = ff_imdct_half_vfp; #endif } diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S index f1ab37c..65498a5 100644 --- a/libavcodec/arm/fft_vfp.S +++ b/libavcodec/arm/fft_vfp.S @@ -21,8 +21,9 @@ #include "libavutil/arm/asm.S" -@ TODO: * FFTs wider than 16 -@ * dispatch code +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: +@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and +@ all single-precision VFP registers may be corrupted on exit. function fft4_vfp vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] @@ -131,18 +132,22 @@ endfunc vstr d9, [a1, #3 * 2*4] .endm +function fft8_internal_vfp + macro_fft8_head + macro_fft8_tail + bx lr +endfunc + function fft8_vfp ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 fmrx a2, FPSCR fmxr FPSCR, a3 vpush {s16-s31} - - macro_fft8_head - macro_fft8_tail - + mov ip, lr + bl fft8_internal_vfp vpop {s16-s31} fmxr FPSCR, a2 - bx lr + bx ip endfunc .align 3 @@ -153,12 +158,7 @@ cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 .float 0.3826834261417388916015625 -function ff_fft16_vfp, export=1 - ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 - fmrx a2, FPSCR - fmxr FPSCR, a3 - vpush {s16-s31} - +function fft16_internal_vfp macro_fft8_head @ FFT4(z+8) vldr d10, [a1, #8 * 2*4] @@ -292,7 +292,240 @@ function ff_fft16_vfp, export=1 vstr d8, [a1, #0 * 2*4] vstr d9, [a1, #4 * 2*4] + bx lr +endfunc + +function ff_fft16_vfp, export=1 + ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 + fmrx a2, FPSCR + fmxr FPSCR, a3 + vpush {s16-s31} + mov ip, lr + bl fft16_internal_vfp vpop {s16-s31} fmxr FPSCR, a2 - bx lr + bx ip +endfunc + +.macro pass n, z0, z1, z2, z3 + add v6, v5, #4*2*\n + @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + vldmdb v6!, {s2} + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vldmia v5!, {s0,s1} @ s0 is unused + vldr s7, [\z2, #8*o2] @ t1 + vmul.f s20, s16, s2 @ vector * scalar + vldr s0, [\z3, #8*o3] @ t5 + vldr s6, [\z2, #8*o2+4] @ t2 + vldr s3, [\z3, #8*o3+4] @ t6 + vmul.f s16, s16, s1 @ vector * scalar + ldr a4, =\n-1 +1: add \z0, \z0, #8*2 + .if \n*4*2 >= 512 + add \z1, \z1, #8*2 + .endif + .if \n*4*2 >= 256 + add \z2, \z2, #8*2 + .endif + .if \n*4*2 >= 512 + add \z3, \z3, #8*2 + .endif + @ up to 2 stalls (VFP vector issuing / waiting for s0) + @ depending upon whether this is the first iteration and + @ how many add instructions are inserted above + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0-8*2] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1-8*2] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0-8*2] @ s8,s9 + vstr d5, [\z1, #8*o1-8*2] @ s10,s11 + @ stall (waiting for s5) + vstr d2, [\z2, #8*o2-8*2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3-8*2] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1-8*2] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 + vldr d4, [\z2, #8*o2] @ s8,s9 + vldmdb v6!, {s2,s3} + vldr d5, [\z3, #8*o3] @ s10,s11 + vadd.f s20, s4, s12 @ vector + vector + vldmia v5!, {s0,s1} + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vmul.f s12, s8, s3 @ vector * scalar + vstr d10, [\z0, #8*1-8*2] @ s20,s21 + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 + vmul.f s8, s8, s0 @ vector * scalar + vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 + @ stall (waiting for s7) + vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 + vmul.f s20, s16, s2 @ vector * scalar + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vadd.f s7, s8, s13 @ t1 + vsub.f s6, s9, s12 @ t2 + vsub.f s0, s10, s15 @ t5 + vadd.f s3, s11, s14 @ t6 + vmul.f s16, s16, s1 @ vector * scalar + subs a4, a4, #1 + bne 1b + @ What remains is identical to the first two indentations of + @ the above, but without the increment of z + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0] @ s8,s9 + vstr d5, [\z1, #8*o1] @ s10,s11 + vstr d2, [\z2, #8*o2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)] @ s14,s15 + vadd.f s20, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vstr d10, [\z0, #8*1] @ s20,s21 + vstr d11, [\z1, #8*(o1+1)] @ s22,s23 + vstr d2, [\z2, #8*(o2+1)] @ s4,s5 + vstr d3, [\z3, #8*(o3+1)] @ s6,s7 +.endm + +.macro fft_internal_vfp name, name2, name4, costable, n +function \name + .if \n >= 512 + push {v1-v6,lr} + .elseif \n >= 256 + push {v1-v2,v5-v6,lr} + .else + push {v1,v5-v6,lr} + .endif + mov v1, a1 + bl \name2 + add a1, v1, #8*(\n/4)*2 + bl \name4 + ldr v5, =\costable + add a1, v1, #8*(\n/4)*3 + bl \name4 + .if \n >= 512 + .set o1, 0*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 0*(\n/4/2) + add v2, v1, #8*2*(\n/4/2) + add v3, v1, #8*4*(\n/4/2) + add v4, v1, #8*6*(\n/4/2) + pass (\n/4/2), v1, v2, v3, v4 + pop {v1-v6,pc} + .elseif \n >= 256 + .set o1, 2*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 2*(\n/4/2) + add v2, v1, #8*4*(\n/4/2) + pass (\n/4/2), v1, v1, v2, v2 + pop {v1-v2,v5-v6,pc} + .else + .set o1, 2*(\n/4/2) + .set o2, 4*(\n/4/2) + .set o3, 6*(\n/4/2) + pass (\n/4/2), v1, v1, v1, v1 + pop {v1,v5-v6,pc} + .endif +endfunc +.endm + +#define DECL_FFT(n,n2,n4) \ +fft_internal_vfp fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\ + ;\ +function fft##n##_vfp ;\ + ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\ + fmrx a2, FPSCR ;\ + fmxr FPSCR, a3 ;\ + vpush {s16-s31} ;\ + mov ip, lr ;\ + bl fft##n##_internal_vfp ;\ + vpop {s16-s31} ;\ + fmxr FPSCR, a2 ;\ + bx ip ;\ +endfunc ;\ + ;\ +.ltorg + +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384) + +function ff_fft_calc_vfp, export=1 + ldr ip, [a1, #0] @ nbits + mov a1, a2 + ldr ip, [pc, ip, lsl #2] + bx ip + .word 0 + .word 0 + .word fft4_vfp + .word fft8_vfp + .word ff_fft16_vfp @ this one alone is exported + .word fft32_vfp + .word fft64_vfp + .word fft128_vfp + .word fft256_vfp + .word fft512_vfp + .word fft1024_vfp + .word fft2048_vfp + .word fft4096_vfp + .word fft8192_vfp + .word fft16384_vfp + .word fft32768_vfp + .word fft65536_vfp endfunc -- 1.7.5.4 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel