Andreas Rheinhardt:
> Up until now, ff_startcode_find_candidate_c() simply casts
> an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
> in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
> alignment requirement of these types as well as effective type
> rules of the C standard. This commit therefore replaces these
> direct accesses with AV_RN64/32; this also improves
> readability.
> 
> UBSan reported these unaligned accesses which happened in 233
> FATE-tests involving H.264 and VC-1 (this has also been reported
> in tickets #8138 and #8485); these tests are fixed by this commit.
> 
> The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
> loongarch, ppc and x64. There was only a slight difference for mips.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com>
> ---
> This is v2 of 
> https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinha...@gmail.com/
> 
> Here is the mips code before this change:
> 
> startcode_old_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0: 18a00029        blez    a1,a8 <ff_startcode_find_candidate_c+0xa8>
>    4: 3c08ff7f        lui     a4,0xff7f
>    8: 3c07ff01        lui     a3,0xff01
>    c: 65087f7f        daddiu  a4,a4,32639
>   10: 64e70101        daddiu  a3,a3,257
>   14: 00084438        dsll    a4,a4,0x10
>   18: 00073c38        dsll    a3,a3,0x10
>   1c: 65087f7f        daddiu  a4,a4,32639
>   20: 64e70101        daddiu  a3,a3,257
>   24: 00084478        dsll    a4,a4,0x11
>   28: 00073df8        dsll    a3,a3,0x17
>   2c: 00803025        move    a2,a0
>   30: 00001025        move    v0,zero
>   34: 3508feff        ori     a4,a4,0xfeff
>   38: 10000005        b       50 <ff_startcode_find_candidate_c+0x50>
>   3c: 34e78080        ori     a3,a3,0x8080
>   40: 24420008        addiu   v0,v0,8
>   44: 0045182a        slt     v1,v0,a1
>   48: 10600015        beqz    v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   4c: 00000000        nop
>   50: dcc30000        ld      v1,0(a2)
>   54: 0068482d        daddu   a5,v1,a4
>   58: 44a30000        dmtc1   v1,$f0
>   5c: 44a90800        dmtc1   a5,$f1
>   60: 4be10002        pandn   $f0,$f0,$f1
>   64: 44230000        dmfc1   v1,$f0
>   68: 00671824        and     v1,v1,a3
>   6c: 1060fff4        beqz    v1,40 <ff_startcode_find_candidate_c+0x40>
>   70: 64c60008        daddiu  a2,a2,8
>   74: 0045182a        slt     v1,v0,a1
>   78: 10600009        beqz    v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   7c: 0082182d        daddu   v1,a0,v0
>   80: 10000005        b       98 <ff_startcode_find_candidate_c+0x98>
>   84: 90640000        lbu     a0,0(v1)
>   88: 24420001        addiu   v0,v0,1
>   8c: 10a20008        beq     a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
>   90: 00000000        nop
>   94: 90640000        lbu     a0,0(v1)
>   98: 1480fffb        bnez    a0,88 <ff_startcode_find_candidate_c+0x88>
>   9c: 64630001        daddiu  v1,v1,1
>   a0: 03e00008        jr      ra
>   a4: 00000000        nop
>   a8: 03e00008        jr      ra
>   ac: 00001025        move    v0,zero
>   b0: 03e00008        jr      ra
>   b4: 00a01025        move    v0,a1
>       ...
> 
> And here after this change:
> 
> startcode_new_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0: 18a0002b        blez    a1,b0 <ff_startcode_find_candidate_c+0xb0>
>    4: 3c08ff7f        lui     a4,0xff7f
>    8: 3c07ff01        lui     a3,0xff01
>    c: 65087f7f        daddiu  a4,a4,32639
>   10: 64e70101        daddiu  a3,a3,257
>   14: 00084438        dsll    a4,a4,0x10
>   18: 00073c38        dsll    a3,a3,0x10
>   1c: 65087f7f        daddiu  a4,a4,32639
>   20: 64e70101        daddiu  a3,a3,257
>   24: 00084478        dsll    a4,a4,0x11
>   28: 00073df8        dsll    a3,a3,0x17
>   2c: 00803025        move    a2,a0
>   30: 00001025        move    v0,zero
>   34: 3508feff        ori     a4,a4,0xfeff
>   38: 10000005        b       50 <ff_startcode_find_candidate_c+0x50>
>   3c: 34e78080        ori     a3,a3,0x8080
>   40: 24420008        addiu   v0,v0,8
>   44: 0045182a        slt     v1,v0,a1
>   48: 10600017        beqz    v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   4c: 00000000        nop
>   50: 68c30007        ldl     v1,7(a2)
>   54: 6cc30000        ldr     v1,0(a2)
>   58: 0068482d        daddu   a5,v1,a4
>   5c: 44a30000        dmtc1   v1,$f0
>   60: 44a90800        dmtc1   a5,$f1
>   64: 4be10002        pandn   $f0,$f0,$f1
>   68: 44230000        dmfc1   v1,$f0
>   6c: 00671824        and     v1,v1,a3
>   70: 1060fff3        beqz    v1,40 <ff_startcode_find_candidate_c+0x40>
>   74: 64c60008        daddiu  a2,a2,8
>   78: 0045182a        slt     v1,v0,a1
>   7c: 1060000a        beqz    v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   80: 0082182d        daddu   v1,a0,v0
>   84: 10000006        b       a0 <ff_startcode_find_candidate_c+0xa0>
>   88: 90640000        lbu     a0,0(v1)
>   8c: 00000000        nop
>   90: 24420001        addiu   v0,v0,1
>   94: 10a20008        beq     a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
>   98: 00000000        nop
>   9c: 90640000        lbu     a0,0(v1)
>   a0: 1480fffb        bnez    a0,90 <ff_startcode_find_candidate_c+0x90>
>   a4: 64630001        daddiu  v1,v1,1
>   a8: 03e00008        jr      ra
>   ac: 00000000        nop
>   b0: 03e00008        jr      ra
>   b4: 00001025        move    v0,zero
>   b8: 03e00008        jr      ra
>   bc: 00a01025        move    v0,a1
> 
> As one can see, the difference is that an ld has been replaced
> by a pair of ldl and ldr. I don't know the performance implications
> of this.
> 
>  libavcodec/startcode.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
> index 9efdffe8c6..d84f326521 100644
> --- a/libavcodec/startcode.c
> +++ b/libavcodec/startcode.c
> @@ -25,6 +25,7 @@
>   * @author Michael Niedermayer <michae...@gmx.at>
>   */
>  
> +#include "libavutil/intreadwrite.h"
>  #include "startcode.h"
>  #include "config.h"
>  
> @@ -38,14 +39,14 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int 
> size)
>       */
>  #if HAVE_FAST_64BIT
>      while (i < size &&
> -            !((~*(const uint64_t *)(buf + i) &
> -                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
> +            !((~AV_RN64(buf + i) &
> +                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
>                      0x8080808080808080ULL))
>          i += 8;
>  #else
>      while (i < size &&
> -            !((~*(const uint32_t *)(buf + i) &
> -                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
> +            !((~AV_RN32(buf + i) &
> +                    (AV_RN32(buf + i) - 0x01010101U)) &
>                      0x80808080U))
>          i += 4;
>  #endif

Unfortunately, the list in the commit message is incorrect: configure
enables fast_unaligned for aarch64, ppc, x86, loongarch, certain
loongson mips processors and armv6-8. The above claim that assembly for
arm is unaffected by this change is not correct when one actually deals
with one of the CPUs that are marked as fast unaligned. When compiling
for armv6 the assembly changes from

startcode_old_O3.o:     file format elf32-littlearm


Disassembly of section .text:

00000000 <ff_startcode_find_candidate_c>:
   0:   2900            cmp     r1, #0
   2:   dd20            ble.n   46 <ff_startcode_find_candidate_c+0x46>
   4:   f1a0 0c04       sub.w   ip, r0, #4
   8:   2300            movs    r3, #0
   a:   b410            push    {r4}
   c:   e002            b.n     14 <ff_startcode_find_candidate_c+0x14>
   e:   3304            adds    r3, #4
  10:   4299            cmp     r1, r3
  12:   dd14            ble.n   3e <ff_startcode_find_candidate_c+0x3e>
  14:   f85c 4f04       ldr.w   r4, [ip, #4]!
  18:   f1a4 3201       sub.w   r2, r4, #16843009       ; 0x1010101
  1c:   ea22 0204       bic.w   r2, r2, r4
  20:   f012 3f80       tst.w   r2, #2155905152 ; 0x80808080
  24:   d0f3            beq.n   e <ff_startcode_find_candidate_c+0xe>
  26:   4299            cmp     r1, r3
  28:   dd09            ble.n   3e <ff_startcode_find_candidate_c+0x3e>
  2a:   1e5a            subs    r2, r3, #1
  2c:   4402            add     r2, r0
  2e:   e002            b.n     36 <ff_startcode_find_candidate_c+0x36>
  30:   3301            adds    r3, #1
  32:   4299            cmp     r1, r3
  34:   d003            beq.n   3e <ff_startcode_find_candidate_c+0x3e>
  36:   f812 0f01       ldrb.w  r0, [r2, #1]!
  3a:   2800            cmp     r0, #0
  3c:   d1f8            bne.n   30 <ff_startcode_find_candidate_c+0x30>
  3e:   4618            mov     r0, r3
  40:   f85d 4b04       ldr.w   r4, [sp], #4
  44:   4770            bx      lr
  46:   2300            movs    r3, #0
  48:   4618            mov     r0, r3
  4a:   4770            bx      lr

to

startcode_new_O3.o:     file format elf32-littlearm


Disassembly of section .text:

00000000 <ff_startcode_find_candidate_c>:
   0:   2300            movs    r3, #0
   2:   2900            cmp     r1, #0
   4:   dc03            bgt.n   e <ff_startcode_find_candidate_c+0xe>
   6:   e017            b.n     38 <ff_startcode_find_candidate_c+0x38>
   8:   3304            adds    r3, #4
   a:   4299            cmp     r1, r3
   c:   dd14            ble.n   38 <ff_startcode_find_candidate_c+0x38>
   e:   f850 c003       ldr.w   ip, [r0, r3]
  12:   f1ac 3201       sub.w   r2, ip, #16843009       ; 0x1010101
  16:   ea22 020c       bic.w   r2, r2, ip
  1a:   f012 3f80       tst.w   r2, #2155905152 ; 0x80808080
  1e:   d0f3            beq.n   8 <ff_startcode_find_candidate_c+0x8>
  20:   4299            cmp     r1, r3
  22:   dd09            ble.n   38 <ff_startcode_find_candidate_c+0x38>
  24:   1e5a            subs    r2, r3, #1
  26:   4402            add     r2, r0
  28:   e002            b.n     30 <ff_startcode_find_candidate_c+0x30>
  2a:   3301            adds    r3, #1
  2c:   4299            cmp     r1, r3
  2e:   d003            beq.n   38 <ff_startcode_find_candidate_c+0x38>
  30:   f812 0f01       ldrb.w  r0, [r2, #1]!
  34:   2800            cmp     r0, #0
  36:   d1f8            bne.n   2a <ff_startcode_find_candidate_c+0x2a>
  38:   4618            mov     r0, r3
  3a:   4770            bx      lr

Once again, I have no clue about the performance implications of this
and would welcome any feedback on this.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to