On Wed, Feb 25, 2015 at 02:29:11PM -0300, James Almer wrote: > On 25/02/15 12:43 PM, Clément Bœsch wrote: > > On Tue, Feb 24, 2015 at 10:05:24PM -0300, James Almer wrote: > >> Signed-off-by: James Almer <jamr...@gmail.com> > >> --- > >> I decided to go the configure route since other features (cmov, clz) also > >> do > >> it, but if preferred this could instead be done with a new intmath.h > >> header > >> in the x86/ folder containing something like > >> > >> #if defined(__GNUC__) && defined(__POPCNT__) > >> #define av_popcount __builtin_popcount > >> #if ARCH_X86_64 > >> #define av_popcount64 __builtin_popcountll > >> #endif > >> #endif > >> > >> For a cleaner compile-time check. > >> > >> configure | 12 ++++++++++-- > >> libavutil/intmath.h | 13 +++++++++++++ > >> 2 files changed, 23 insertions(+), 2 deletions(-) > >> > > > > For the record, the builtin implementation looks like this here: > > > > 0000000000000000 <av_popcount_c>: > > 0: 89 f8 mov %edi,%eax > > 2: d1 e8 shr %eax > > 4: 25 55 55 55 55 and $0x55555555,%eax > > 9: 29 c7 sub %eax,%edi > > b: 89 fa mov %edi,%edx > > d: c1 ef 02 shr $0x2,%edi > > 10: 81 e2 33 33 33 33 and $0x33333333,%edx > > 16: 81 e7 33 33 33 33 and $0x33333333,%edi > > 1c: 8d 04 17 lea (%rdi,%rdx,1),%eax > > 1f: 89 c2 mov %eax,%edx > > 21: c1 ea 04 shr $0x4,%edx > > 24: 01 d0 add %edx,%eax > > 26: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax > > 2b: 89 c2 mov %eax,%edx > > 2d: c1 ea 08 shr $0x8,%edx > > 30: 01 d0 add %edx,%eax > > 32: 89 c2 mov %eax,%edx > > 34: c1 ea 10 shr $0x10,%edx > > 37: 01 d0 add %edx,%eax > > 39: 83 e0 3f and $0x3f,%eax > > 3c: c3 retq > > 3d: 0f 1f 00 nopl (%rax) > > > > 0000000000000040 <popcount_gcc>: > > 40: 48 83 ec 08 sub $0x8,%rsp > > 44: 89 ff mov %edi,%edi > > 46: e8 00 00 00 00 callq 4b <popcount_gcc+0xb> > > 4b: 48 83 c4 08 add $0x8,%rsp > > 4f: c3 retq > > > > 0000000000000040 <popcount_clang>: > > 40: 89 f8 mov %edi,%eax > > 42: d1 e8 shr %eax > > 44: 25 55 55 55 55 and $0x55555555,%eax > > 49: 29 c7 sub 
%eax,%edi > > 4b: 89 f8 mov %edi,%eax > > 4d: 25 33 33 33 33 and $0x33333333,%eax > > 52: c1 ef 02 shr $0x2,%edi > > 55: 81 e7 33 33 33 33 and $0x33333333,%edi > > 5b: 01 c7 add %eax,%edi > > 5d: 89 f8 mov %edi,%eax > > 5f: c1 e8 04 shr $0x4,%eax > > 62: 01 f8 add %edi,%eax > > 64: 25 0f 0f 0f 0f and $0xf0f0f0f,%eax > > 69: 69 c0 01 01 01 01 imul $0x1010101,%eax,%eax > > 6f: c1 e8 18 shr $0x18,%eax > > 72: c3 retq > > > > We might see relevant "optimizations" for our reference code. > > What's clang code for av_popcount64_c, or their builtin?
0000000000000000 <popcount64_clang>: 0: 48 89 f8 mov rax,rdi 3: 48 d1 e8 shr rax,1 6: 48 b9 55 55 55 55 55 movabs rcx,0x5555555555555555 d: 55 55 55 10: 48 21 c1 and rcx,rax 13: 48 29 cf sub rdi,rcx 16: 48 b8 33 33 33 33 33 movabs rax,0x3333333333333333 1d: 33 33 33 20: 48 89 f9 mov rcx,rdi 23: 48 21 c1 and rcx,rax 26: 48 c1 ef 02 shr rdi,0x2 2a: 48 21 c7 and rdi,rax 2d: 48 01 cf add rdi,rcx 30: 48 89 f8 mov rax,rdi 33: 48 c1 e8 04 shr rax,0x4 37: 48 01 f8 add rax,rdi 3a: 48 b9 0f 0f 0f 0f 0f movabs rcx,0xf0f0f0f0f0f0f0f 41: 0f 0f 0f 44: 48 21 c1 and rcx,rax 47: 48 b8 01 01 01 01 01 movabs rax,0x101010101010101 4e: 01 01 01 51: 48 0f af c1 imul rax,rcx 55: 48 c1 e8 38 shr rax,0x38 59: c3 ret 5a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0] 0000000000000060 <av_popcount64_c>: 60: 89 f8 mov eax,edi 62: d1 e8 shr eax,1 64: 25 55 55 55 55 and eax,0x55555555 69: 89 f9 mov ecx,edi 6b: 29 c1 sub ecx,eax 6d: 89 c8 mov eax,ecx 6f: 25 33 33 33 33 and eax,0x33333333 74: c1 e9 02 shr ecx,0x2 77: 81 e1 33 33 33 33 and ecx,0x33333333 7d: 01 c1 add ecx,eax 7f: 89 c8 mov eax,ecx 81: c1 e8 04 shr eax,0x4 84: 01 c8 add eax,ecx 86: 25 0f 0f 0f 0f and eax,0xf0f0f0f 8b: 89 c1 mov ecx,eax 8d: c1 e9 08 shr ecx,0x8 90: 01 c1 add ecx,eax 92: 89 c8 mov eax,ecx 94: c1 e8 10 shr eax,0x10 97: 01 c8 add eax,ecx 99: 83 e0 3f and eax,0x3f 9c: 48 89 f9 mov rcx,rdi 9f: 48 c1 e9 20 shr rcx,0x20 a3: 48 c1 ef 21 shr rdi,0x21 a7: 81 e7 55 55 55 55 and edi,0x55555555 ad: 29 f9 sub ecx,edi af: 89 ca mov edx,ecx b1: 81 e2 33 33 33 33 and edx,0x33333333 b7: c1 e9 02 shr ecx,0x2 ba: 81 e1 33 33 33 33 and ecx,0x33333333 c0: 01 d1 add ecx,edx c2: 89 ca mov edx,ecx c4: c1 ea 04 shr edx,0x4 c7: 01 ca add edx,ecx c9: 81 e2 0f 0f 0f 0f and edx,0xf0f0f0f cf: 89 d1 mov ecx,edx d1: c1 e9 08 shr ecx,0x8 d4: 01 d1 add ecx,edx d6: 89 ca mov edx,ecx d8: c1 ea 10 shr edx,0x10 db: 01 ca add edx,ecx dd: 83 e2 3f and edx,0x3f e0: 01 d0 add eax,edx e2: c3 ret > We're currently calling av_popcount_c twice from within 
av_popcount64_c, > when on x86_64 CPUs we could probably take advantage of the 64-bit GPRs. > -- Clément B.
pgp1l4qMy_hVW.pgp
Description: PGP signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel