Re: [FFmpeg-devel] [PATCH 2/2] all: do standards compliant absdiff computation

Ganesh Ajjanagadde Sun, 23 Aug 2015 14:56:43 -0700

On Sun, Aug 23, 2015 at 5:41 PM, Clément Bœsch <[email protected]> wrote:
> On Sun, Aug 23, 2015 at 05:33:11PM -0400, Ganesh Ajjanagadde wrote:
>> On Sun, Aug 23, 2015 at 5:28 PM, Clément Bœsch <[email protected]> wrote:
>> > On Sun, Aug 23, 2015 at 11:23:54PM +0200, Clément Bœsch wrote:
>> >> On Sun, Aug 23, 2015 at 11:58:23AM -0400, Ganesh Ajjanagadde wrote:
>> >> [...]
>> >> > diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
>> >> > index fa15d9c..0178793 100644
>> >> > --- a/libavfilter/vf_hqx.c
>> >> > +++ b/libavfilter/vf_hqx.c
>> >> > @@ -65,9 +65,9 @@ static av_always_inline int yuv_diff(uint32_t yuv1, 
>> >> > uint32_t yuv2)
>> >> >  #define YMASK 0xff0000
>> >> >  #define UMASK 0x00ff00
>> >> >  #define VMASK 0x0000ff
>> >> > -    return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
>> >> > -           abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 <<  8) ||
>> >> > -           abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 <<  0);
>> >> > +    return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
>> >> > +           FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 <<  8) ||
>> >> > +           FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 <<  0);
>> >>
>> >> This is one of the bottleneck function of the filter. How does it affect
>> >> speed? Can you compare the generated ASM?
>> >>
>> >
>> > To answer my second question:
>> >
>> > [/tmp]☭ cat a.c
>> > #include <stdlib.h>
>> > #include <stdint.h>
>> >
>> > #define YMASK 0xff0000
>> > #define UMASK 0x00ff00
>> > #define VMASK 0x0000ff
>> >
>> > #define FFUABSDIFF(a,b) (((a) > (b)) ? ((a)-(b)) : ((b)-(a)))
>> >
>> > int yuv_diff_0(uint32_t yuv1, uint32_t yuv2)
>> > {
>> >     return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
>> >            abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 <<  8) ||
>> >            abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 <<  0);
>> > }
>> >
>> > int yuv_diff_1(uint32_t yuv1, uint32_t yuv2)
>> > {
>> >     return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
>> >            FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 <<  8) ||
>> >            FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 <<  0);
>> > }
>> > [/tmp]☭ gcc -Wall -O2 -c a.c && objdump -d -Mintel a.o
>> >
>> > a.o:     file format elf64-x86-64
>> >
>> >
>> > Disassembly of section .text:
>> >
>> > 0000000000000000 <yuv_diff_0>:
>> >    0:   89 f8                   mov    eax,edi
>> >    2:   89 f2                   mov    edx,esi
>> >    4:   81 e2 00 00 ff 00       and    edx,0xff0000
>> >    a:   25 00 00 ff 00          and    eax,0xff0000
>> >    f:   29 d0                   sub    eax,edx
>> >   11:   99                      cdq
>> >   12:   31 d0                   xor    eax,edx
>> >   14:   29 d0                   sub    eax,edx
>> >   16:   89 c2                   mov    edx,eax
>> >   18:   b8 01 00 00 00          mov    eax,0x1
>> >   1d:   81 fa 00 00 30 00       cmp    edx,0x300000
>> >   23:   7f 3e                   jg     63 <yuv_diff_0+0x63>
>> >   25:   89 fa                   mov    edx,edi
>> >   27:   89 f1                   mov    ecx,esi
>> >   29:   81 e1 00 ff 00 00       and    ecx,0xff00
>> >   2f:   81 e2 00 ff 00 00       and    edx,0xff00
>> >   35:   29 ca                   sub    edx,ecx
>> >   37:   89 d1                   mov    ecx,edx
>> >   39:   c1 f9 1f                sar    ecx,0x1f
>> >   3c:   31 ca                   xor    edx,ecx
>> >   3e:   29 ca                   sub    edx,ecx
>> >   40:   81 fa 00 07 00 00       cmp    edx,0x700
>> >   46:   7f 1b                   jg     63 <yuv_diff_0+0x63>
>> >   48:   40 0f b6 ff             movzx  edi,dil
>> >   4c:   40 0f b6 f6             movzx  esi,sil
>> >   50:   29 f7                   sub    edi,esi
>> >   52:   89 f8                   mov    eax,edi
>> >   54:   c1 f8 1f                sar    eax,0x1f
>> >   57:   31 c7                   xor    edi,eax
>> >   59:   29 c7                   sub    edi,eax
>> >   5b:   31 c0                   xor    eax,eax
>> >   5d:   83 ff 06                cmp    edi,0x6
>> >   60:   0f 9f c0                setg   al
>> >   63:   f3 c3                   repz ret
>> >   65:   90                      nop
>> >   66:   66 2e 0f 1f 84 00 00    nop    WORD PTR cs:[rax+rax*1+0x0]
>> >   6d:   00 00 00
>> >
>> > 0000000000000070 <yuv_diff_1>:
>> >   70:   89 fa                   mov    edx,edi
>> >   72:   89 f0                   mov    eax,esi
>> >   74:   81 e2 00 00 ff 00       and    edx,0xff0000
>> >   7a:   25 00 00 ff 00          and    eax,0xff0000
>> >   7f:   39 c2                   cmp    edx,eax
>> >   81:   76 4d                   jbe    d0 <yuv_diff_1+0x60>
>> >   83:   29 c2                   sub    edx,eax
>> >   85:   b8 01 00 00 00          mov    eax,0x1
>> >   8a:   81 fa 00 00 30 00       cmp    edx,0x300000
>> >   90:   77 7e                   ja     110 <yuv_diff_1+0xa0>
>> >   92:   89 fa                   mov    edx,edi
>> >   94:   89 f0                   mov    eax,esi
>> >   96:   81 e2 00 ff 00 00       and    edx,0xff00
>> >   9c:   25 00 ff 00 00          and    eax,0xff00
>> >   a1:   39 c2                   cmp    edx,eax
>> >   a3:   76 43                   jbe    e8 <yuv_diff_1+0x78>
>> >   a5:   29 c2                   sub    edx,eax
>> >   a7:   b8 01 00 00 00          mov    eax,0x1
>> >   ac:   81 fa 00 07 00 00       cmp    edx,0x700
>> >   b2:   77 74                   ja     128 <yuv_diff_1+0xb8>
>> >   b4:   40 0f b6 ff             movzx  edi,dil
>> >   b8:   40 0f b6 f6             movzx  esi,sil
>> >   bc:   39 f7                   cmp    edi,esi
>> >   be:   76 58                   jbe    118 <yuv_diff_1+0xa8>
>> >   c0:   29 f7                   sub    edi,esi
>> >   c2:   31 c0                   xor    eax,eax
>> >   c4:   83 ff 06                cmp    edi,0x6
>> >   c7:   0f 97 c0                seta   al
>> >   ca:   c3                      ret
>> >   cb:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
>> >   d0:   29 d0                   sub    eax,edx
>> >   d2:   89 c2                   mov    edx,eax
>> >   d4:   b8 01 00 00 00          mov    eax,0x1
>> >   d9:   81 fa 00 00 30 00       cmp    edx,0x300000
>> >   df:   76 b1                   jbe    92 <yuv_diff_1+0x22>
>> >   e1:   f3 c3                   repz ret
>> >   e3:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
>> >   e8:   29 d0                   sub    eax,edx
>> >   ea:   89 c2                   mov    edx,eax
>> >   ec:   b8 01 00 00 00          mov    eax,0x1
>> >   f1:   81 fa 00 07 00 00       cmp    edx,0x700
>> >   f7:   77 e8                   ja     e1 <yuv_diff_1+0x71>
>> >   f9:   40 0f b6 ff             movzx  edi,dil
>> >   fd:   40 0f b6 f6             movzx  esi,sil
>> >  101:   39 f7                   cmp    edi,esi
>> >  103:   76 13                   jbe    118 <yuv_diff_1+0xa8>
>> >  105:   eb b9                   jmp    c0 <yuv_diff_1+0x50>
>> >  107:   66 0f 1f 84 00 00 00    nop    WORD PTR [rax+rax*1+0x0]
>> >  10e:   00 00
>> >  110:   f3 c3                   repz ret
>> >  112:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]
>> >  118:   29 fe                   sub    esi,edi
>> >  11a:   31 c0                   xor    eax,eax
>> >  11c:   83 fe 06                cmp    esi,0x6
>> >  11f:   0f 97 c0                seta   al
>> >  122:   c3                      ret
>> >  123:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
>> >  128:   f3 c3                   repz ret
>> > [/tmp]☭
>> >
>> > I must say I'm slightly uncomfortable with that change.
>>
>> I don't know much assembly at the moment, can you please explain the 
>> difference
>> and what makes you uncomfortable?
>>
>
> Well, one is almost twice longer, with about 5x more branching. I'm pretty
> sure you can guess which one will be way slower (assuming it even behaves
> the same)


So I tested Nicolas' idea of using abs((int)a-(int)b).
That yields essentially identical asm, and as discussed on the thread,
is standards compliant.
I have a gut feeling it may be because abs() and the like already have
branch-less versions,
and there are issues (related to overflow) with unsigned subtraction, etc.
For some reason, with unsigned stuff, it seems like the compiler can't
generate such branch-less code.

I will implement Nicolas' idea unless there are objections.

>
> --
> Clément B.
>
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected]
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 2/2] all: do standards compliant absdiff computation

Reply via email to