Until now the function ff_startcode_find_candidate_c did not really search for startcodes (the startcode 0x00 0x00 0x01 (used in MPEG-1/2/4, VC-1 and H.264/5) is the only startcode meant here). Instead it searched for zero bytes and returned the earliest position of a zero byte. This of course led to lots of false positives - millions per GB of video. This has been changed: The first position of the buffer that may be part of a four-byte startcode is now returned. This excludes zero bytes that are known not to belong to a startcode, but zero bytes at the end of a buffer that might be part of a startcode whose second part is in the next buffer are also returned. This is in accordance with the expectations of the current callers of ff_startcode_find_candidate_c, namely the H.264 parser and the VC-1 parser. This is also the reason why "find_candidate" in the name of the function has been kept.
Getting rid of lots of function calls with their accompanying overhead of course brings a speedup with it: Here are some benchmarks for a 30.2 Mb/s transport stream A (10 iterations of 8192 runs each) and another 7.4 Mb/s transport stream B (10 iterations of 131072 runs each) on an x64 Haswell: | vanilla | aligned | current: aligned, | (unaligned) | reads | no false positives --|-------------|---------|------------------- A | 411578 | 424503 | 323355 B | 55476 | 58326 | 44147 vanilla refers to the state before the switch to aligned reads; "aligned reads" refers to the state after the switch to aligned reads. (Calls to h264_find_frame_end have been timed; given that the amount of calls to ff_startcode_find_candidate_c has been reduced considerably, timing them directly would be worthless.) Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@gmail.com> --- libavcodec/h264dsp.h | 7 +-- libavcodec/startcode.c | 101 +++++++++++++++++++++++++++++++++++------ libavcodec/vc1dsp.h | 6 +-- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index cbea3173c6..51de7a4e21 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -108,11 +108,8 @@ typedef struct H264DSPContext { void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); /** - * Search buf from the start for up to size bytes. Return the index - * of a zero byte, or >= size if not found. Ideally, use lookahead - * to filter out any zero bytes that are known to not be followed by - * one or more further zero bytes and a one byte. Better still, filter - * out any bytes that form the trailing_zero_8bits syntax element too. + * Search buf from the start for up to size bytes. Return the first 0x00 + * that might be part of a (three or four) byte startcode. */ int (*startcode_find_candidate)(const uint8_t *buf, int size); } H264DSPContext; diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c index 8774109df5..b29c1490c0 100644 --- a/libavcodec/startcode.c +++ b/libavcodec/startcode.c @@ -22,35 +22,75 @@ * @file * Accelerated start code search function for start codes common to * MPEG-1/2/4 video, VC-1, H.264/5 - * @author Michael Niedermayer <michae...@gmx.at> + * @author Michael Niedermayer <michae...@gmx.at>, + * @author Andreas Rheinhardt <andreas.rheinha...@gmail.com> */ #include "startcode.h" #include "config.h" #include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" int ff_startcode_find_candidate_c(const uint8_t *buf, int size) { - const uint8_t *start = buf, *end = buf + size; + const uint8_t *start = buf, *end = buf + size, *temp; #define INITIALIZATION(mod) do { \ - if (mod > end - start) \ + if (end - start <= mod + 1) \ goto near_end; \ - for (; (uintptr_t)buf % mod; buf++) \ - if (!*buf) \ - return buf - start; \ + /* From this point on until the end of the MAIN_LOOP, \ + * buf is the earliest possible position of a 0x00 \ + * immediately preceding a startcode's 0x01, i.e. \ + * everything from start to buf (inclusive) is known \ + * to not contain a startcode's 0x01. */ \ + buf += 1; \ + temp = (const uint8_t *)FFALIGN((uintptr_t)buf, mod); \ /* Effective end for MAIN_LOOP in order not to overread. */ \ - end -= mod; \ + end -= mod + 1; \ + goto startcode_check; \ } while (0) #define READ(bitness) AV_RN ## bitness ## A +#define STARTCODE_CHECK(offset) do { \ + if (buf[2 - offset] > 1) { \ + buf += 3; \ + } else if (buf[1 - offset]) { \ + buf += 2; \ + } else if (buf[-offset] || buf[2 - offset] == 0) { \ + buf += 1; \ + } else { \ + buf += 1 - offset; \ + goto found_startcode; \ + } \ + } while (0) + + /* The MAIN_LOOP tries to read several bytes at once. + * A startcode's 0x00 0x01 or 0x00 0x00 will be detected + * by it if these bytes are contained within the bytes + * read at once. */ #define MAIN_LOOP(bitness, mask1, mask2) do { \ - for (; buf <= end; buf += bitness / 8) \ - if ((~READ(bitness)(buf) & (READ(bitness)(buf) - mask1)) \ - & mask2) \ - break; \ + for (; buf < end; ) { \ + if (!((~READ(bitness)(buf) & (READ(bitness)(buf) - mask1)) \ + & mask2)) { \ + buf += bitness / 8; \ + continue; \ + } \ + temp = buf + bitness / 8; \ + /* If the 0x01 of a startcode is at position buf + 1, \ + * the following check detects it. \ + * Because buf got incremented before entering this \ + * loop, buf - 1 may be evaluated. \ + * Because temp + 1 < start + size, buf is always \ + * <= the end of the buffer during the closing check \ + * so that the pointer comparison is defined. */ \ + startcode_check: \ + do { \ + STARTCODE_CHECK(1); \ + } while (buf < temp); \ + buf = temp; \ + } \ /* Revert to the real end. */ \ - end += bitness / 8; \ + end += bitness / 8 + 1; \ } while (0) #if HAVE_FAST_64BIT @@ -61,9 +101,42 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int size) MAIN_LOOP(32, 0x01010101U, 0x80808080U); #endif + /* For the end, buf is the earliest possible position + * of a three-byte startcode's leading 0x00. This is + * done in order not to run into the undefined behaviour + * explained below. */ + buf--; near_end: - for (; buf < end; buf++) - if (!*buf) + while (2 < end - buf) { + /* The STARTCODE_CHECK in the MAIN_LOOP can't be used + * to check whether the file ends with a startcode, + * because for this buf would need to be end - 2. + * But depending on the value of end[-1], buf might get + * incremented by 3 and therefore be beyond end. But + * pointer arithmetic beyond end is undefined behaviour. + * So a STARTCODE_CHECK with a different offset is used + * here: It detects whether buf points to the first 0x00 + * of a three-byte startcode. */ + STARTCODE_CHECK(0); + } + + /* No startcode found, but if the last bytes vanish, + * they may be part of a startcode whose end is not + * in the current buffer. Return the offset of the + * earliest element that may belong to a startcode. */ + for (buf = end; buf > (end - start > 3 ? end - 3 : start); buf--) + if (buf[-1]) break; + + return buf - start; + +found_startcode: + /* buf points to the byte preceding a startcode's 0x01. + * Check whether it is a four-byte startcode. */ + + buf -= 1; + if (buf > start && buf[-1] == 0) + buf--; + return buf - start; } diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h index 75db62b1b4..be9468add3 100644 --- a/libavcodec/vc1dsp.h +++ b/libavcodec/vc1dsp.h @@ -74,10 +74,8 @@ typedef struct VC1DSPContext { int alpha, int width); /** - * Search buf from the start for up to size bytes. Return the index - * of a zero byte, or >= size if not found. Ideally, use lookahead - * to filter out any zero bytes that are known to not be followed by - * one or more further zero bytes and a one byte. + * Search buf from the start for up to size bytes. Return the first 0x00 + * that might be part of a (three or four) byte startcode. */ int (*startcode_find_candidate)(const uint8_t *buf, int size); } VC1DSPContext; -- 2.21.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".