Up until now, ff_startcode_find_candidate_c() simply casts
an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
alignment requirement of these types as well as effective type
rules of the C standard. This commit therefore replaces these
direct accesses with AV_RN64/32; this also improves
readability.

UBSan reported these unaligned accesses which happened in 233
FATE-tests involving H.264 and VC-1 (this has also been reported
in tickets #8138 and #8485); these tests are fixed by this commit.

The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
loongarch, ppc and x64. There was only a slight difference for mips.

Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com>
---
This is v2 of 
https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinha...@gmail.com/

Here is the mips code before this change:

startcode_old_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:   18a00029        blez    a1,a8 <ff_startcode_find_candidate_c+0xa8>
   4:   3c08ff7f        lui     a4,0xff7f
   8:   3c07ff01        lui     a3,0xff01
   c:   65087f7f        daddiu  a4,a4,32639
  10:   64e70101        daddiu  a3,a3,257
  14:   00084438        dsll    a4,a4,0x10
  18:   00073c38        dsll    a3,a3,0x10
  1c:   65087f7f        daddiu  a4,a4,32639
  20:   64e70101        daddiu  a3,a3,257
  24:   00084478        dsll    a4,a4,0x11
  28:   00073df8        dsll    a3,a3,0x17
  2c:   00803025        move    a2,a0
  30:   00001025        move    v0,zero
  34:   3508feff        ori     a4,a4,0xfeff
  38:   10000005        b       50 <ff_startcode_find_candidate_c+0x50>
  3c:   34e78080        ori     a3,a3,0x8080
  40:   24420008        addiu   v0,v0,8
  44:   0045182a        slt     v1,v0,a1
  48:   10600015        beqz    v1,a0 <ff_startcode_find_candidate_c+0xa0>
  4c:   00000000        nop
  50:   dcc30000        ld      v1,0(a2)
  54:   0068482d        daddu   a5,v1,a4
  58:   44a30000        dmtc1   v1,$f0
  5c:   44a90800        dmtc1   a5,$f1
  60:   4be10002        pandn   $f0,$f0,$f1
  64:   44230000        dmfc1   v1,$f0
  68:   00671824        and     v1,v1,a3
  6c:   1060fff4        beqz    v1,40 <ff_startcode_find_candidate_c+0x40>
  70:   64c60008        daddiu  a2,a2,8
  74:   0045182a        slt     v1,v0,a1
  78:   10600009        beqz    v1,a0 <ff_startcode_find_candidate_c+0xa0>
  7c:   0082182d        daddu   v1,a0,v0
  80:   10000005        b       98 <ff_startcode_find_candidate_c+0x98>
  84:   90640000        lbu     a0,0(v1)
  88:   24420001        addiu   v0,v0,1
  8c:   10a20008        beq     a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
  90:   00000000        nop
  94:   90640000        lbu     a0,0(v1)
  98:   1480fffb        bnez    a0,88 <ff_startcode_find_candidate_c+0x88>
  9c:   64630001        daddiu  v1,v1,1
  a0:   03e00008        jr      ra
  a4:   00000000        nop
  a8:   03e00008        jr      ra
  ac:   00001025        move    v0,zero
  b0:   03e00008        jr      ra
  b4:   00a01025        move    v0,a1
        ...

And here after this change:

startcode_new_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:   18a0002b        blez    a1,b0 <ff_startcode_find_candidate_c+0xb0>
   4:   3c08ff7f        lui     a4,0xff7f
   8:   3c07ff01        lui     a3,0xff01
   c:   65087f7f        daddiu  a4,a4,32639
  10:   64e70101        daddiu  a3,a3,257
  14:   00084438        dsll    a4,a4,0x10
  18:   00073c38        dsll    a3,a3,0x10
  1c:   65087f7f        daddiu  a4,a4,32639
  20:   64e70101        daddiu  a3,a3,257
  24:   00084478        dsll    a4,a4,0x11
  28:   00073df8        dsll    a3,a3,0x17
  2c:   00803025        move    a2,a0
  30:   00001025        move    v0,zero
  34:   3508feff        ori     a4,a4,0xfeff
  38:   10000005        b       50 <ff_startcode_find_candidate_c+0x50>
  3c:   34e78080        ori     a3,a3,0x8080
  40:   24420008        addiu   v0,v0,8
  44:   0045182a        slt     v1,v0,a1
  48:   10600017        beqz    v1,a8 <ff_startcode_find_candidate_c+0xa8>
  4c:   00000000        nop
  50:   68c30007        ldl     v1,7(a2)
  54:   6cc30000        ldr     v1,0(a2)
  58:   0068482d        daddu   a5,v1,a4
  5c:   44a30000        dmtc1   v1,$f0
  60:   44a90800        dmtc1   a5,$f1
  64:   4be10002        pandn   $f0,$f0,$f1
  68:   44230000        dmfc1   v1,$f0
  6c:   00671824        and     v1,v1,a3
  70:   1060fff3        beqz    v1,40 <ff_startcode_find_candidate_c+0x40>
  74:   64c60008        daddiu  a2,a2,8
  78:   0045182a        slt     v1,v0,a1
  7c:   1060000a        beqz    v1,a8 <ff_startcode_find_candidate_c+0xa8>
  80:   0082182d        daddu   v1,a0,v0
  84:   10000006        b       a0 <ff_startcode_find_candidate_c+0xa0>
  88:   90640000        lbu     a0,0(v1)
  8c:   00000000        nop
  90:   24420001        addiu   v0,v0,1
  94:   10a20008        beq     a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
  98:   00000000        nop
  9c:   90640000        lbu     a0,0(v1)
  a0:   1480fffb        bnez    a0,90 <ff_startcode_find_candidate_c+0x90>
  a4:   64630001        daddiu  v1,v1,1
  a8:   03e00008        jr      ra
  ac:   00000000        nop
  b0:   03e00008        jr      ra
  b4:   00001025        move    v0,zero
  b8:   03e00008        jr      ra
  bc:   00a01025        move    v0,a1

As one can see, the difference is that an ld has been replaced
by a pair of ldl and ldr. I don't know the performance implications
of this.

 libavcodec/startcode.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 9efdffe8c6..d84f326521 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -25,6 +25,7 @@
  * @author Michael Niedermayer <michae...@gmx.at>
  */
 
+#include "libavutil/intreadwrite.h"
 #include "startcode.h"
 #include "config.h"
 
@@ -38,14 +39,14 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int 
size)
      */
 #if HAVE_FAST_64BIT
     while (i < size &&
-            !((~*(const uint64_t *)(buf + i) &
-                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+            !((~AV_RN64(buf + i) &
+                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
                     0x8080808080808080ULL))
         i += 8;
 #else
     while (i < size &&
-            !((~*(const uint32_t *)(buf + i) &
-                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+            !((~AV_RN32(buf + i) &
+                    (AV_RN32(buf + i) - 0x01010101U)) &
                     0x80808080U))
         i += 4;
 #endif
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to