movenc: parse h264 packets to build Sync Sample and Recovery Point tables

James Almer Wed, 28 Jul 2021 17:21:01 -0700

On 7/28/2021 7:43 PM, Andreas Rheinhardt wrote:

James Almer:

Since we can't blindly trust the keyframe flag in packets and assume its
contents are a valid Sync Sample, do some basic bitstream parsing to build the
Sync Sample table in addition to a Random Access Recovery Point table.


Suggested-by: ffm...@fb.com
Signed-off-by: James Almer <jamr...@gmail.com>
---
  libavformat/movenc.c         | 125 +++++++++++++++++++++++++++++++++--
  libavformat/movenc.h         |   1 +
  tests/ref/lavf-fate/h264.mp4 |   6 +-
  3 files changed, 123 insertions(+), 9 deletions(-)

diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 57062f45c5..159e0261b7 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -34,13 +34,17 @@
  #include "avc.h"
  #include "libavcodec/ac3_parser_internal.h"
  #include "libavcodec/dnxhddata.h"
+#include "libavcodec/h264.h"
+#include "libavcodec/h2645_parse.h"
  #include "libavcodec/flac.h"
  #include "libavcodec/get_bits.h"
+#include "libavcodec/golomb.h"

#include "libavcodec/internal.h"

  #include "libavcodec/put_bits.h"
  #include "libavcodec/vc1_common.h"
  #include "libavcodec/raw.h"
+#include "libavcodec/sei.h"
  #include "internal.h"
  #include "libavutil/avstring.h"
  #include "libavutil/channel_layout.h"
@@ -2537,7 +2541,9 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, 
MOVTrack *track)
      if (!sgpd_entries)
          return AVERROR(ENOMEM);

- av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == AV_CODEC_ID_AAC);

+    av_assert0(track->par->codec_id == AV_CODEC_ID_OPUS ||
+               track->par->codec_id == AV_CODEC_ID_AAC  ||
+               track->par->codec_id == AV_CODEC_ID_H264);

if (track->par->codec_id == AV_CODEC_ID_OPUS) {

          for (i = 0; i < track->entry; i++) {
@@ -2545,7 +2551,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, 
MOVTrack *track)
              int distance = 0;
              for (j = i - 1; j >= 0; j--) {
                  roll_samples_remaining -= get_cluster_duration(track, j);
-                distance++;
+                distance--;
                  if (roll_samples_remaining <= 0)
                      break;
              }
@@ -2555,7 +2561,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, 
MOVTrack *track)
              if (roll_samples_remaining > 0)
                  distance = 0;
              /* Verify distance is a maximum of 32 (2.5ms) packets. */
-            if (distance > 32)
+            if (distance < 32)
                  return AVERROR_INVALIDDATA;
              if (i && distance == sgpd_entries[entries].roll_distance) {
                  sgpd_entries[entries].count++;
@@ -2566,10 +2572,22 @@ static int mov_preroll_write_stbl_atoms(AVIOContext 
*pb, MOVTrack *track)
                  sgpd_entries[entries].group_description_index = distance ? 
++group : 0;
              }
          }
+    } else if (track->par->codec_id == AV_CODEC_ID_H264) {
+        for (i = 0; i < track->entry; i++) {
+            int distance = track->cluster[i].roll_distance;
+            if (i && distance == sgpd_entries[entries].roll_distance) {
+                sgpd_entries[entries].count++;
+            } else {
+                entries++;
+                sgpd_entries[entries].count = 1;
+                sgpd_entries[entries].roll_distance = distance;
+                sgpd_entries[entries].group_description_index = distance ? 
++group : 0;
+            }
+        }
      } else {
          entries++;
          sgpd_entries[entries].count = track->sample_count;
-        sgpd_entries[entries].roll_distance = 1;
+        sgpd_entries[entries].roll_distance = -1;


You seem to be negate roll_distance in this patch; shouldn't this be a
separate patch?


I'll split it.

          sgpd_entries[entries].group_description_index = ++group;
      }
      entries++;
@@ -2588,7 +2606,7 @@ static int mov_preroll_write_stbl_atoms(AVIOContext *pb, 
MOVTrack *track)
      avio_wb32(pb, group); /* entry_count */
      for (i = 0; i < entries; i++) {
          if (sgpd_entries[i].group_description_index) {
-            avio_wb16(pb, -sgpd_entries[i].roll_distance); /* roll_distance */
+            avio_wb16(pb, sgpd_entries[i].roll_distance); /* roll_distance */
          }
      }

@@ -2639,7 +2657,9 @@ static int mov_write_stbl_tag(AVFormatContext *s, AVIOContext *pb, MOVMuxContext

      if (track->cenc.aes_ctr) {
          ff_mov_cenc_write_stbl_atoms(&track->cenc, pb);
      }
-    if (track->par->codec_id == AV_CODEC_ID_OPUS || track->par->codec_id == 
AV_CODEC_ID_AAC) {
+    if (track->par->codec_id == AV_CODEC_ID_OPUS ||
+        track->par->codec_id == AV_CODEC_ID_AAC  ||
+        track->par->codec_id == AV_CODEC_ID_H264) {
          mov_preroll_write_stbl_atoms(pb, track);
      }
      return update_size(pb, pos);
@@ -5150,6 +5170,96 @@ static int mov_parse_mpeg2_frame(AVPacket *pkt, uint32_t 
*flags)
      return 0;
  }

+static int mov_parse_h264_frame(AVPacket *pkt, MOVTrack *trk)

+{
+    GetBitContext gb;
+    const uint8_t *buf = pkt->data;
+    const uint8_t *buf_end = pkt->data + pkt->size;
+    uint32_t state = -1;
+    unsigned roll_distance = 0;
+    int nal_length_size = 0, nalsize;
+    int idr = 0;
+
+    if (!pkt->size)
+        return 0;
+    if (trk->vos_data && trk->vos_data[0] == 1) {
+        if (trk->vos_len < 5)
+            return 0;
+        nal_length_size = (trk->vos_data[4] & 0x03) + 1;
+    }
+
+    while (buf_end - buf >= 4) {
+        if (nal_length_size) {
+            int i = 0;
+            nalsize = get_nalsize(nal_length_size, buf, buf_end - buf, &i, 
NULL);
+            if (nalsize < 0)
+                break;
+            state = buf[nal_length_size];
+            buf += nal_length_size + 1;
+        } else {
+            buf = avpriv_find_start_code(buf, buf_end, &state);


This checks the whole packet for start codes. But given that we have to
convert annex B input to ISOBMFF format anyway, this separate codepath
seems avoidable by working with already reformatted data here.
(In any case: In case this is not an IDR access unit you really check
the whole packet, even when you have already encountered a VLC NALU
despite SEIs having to be in front of them.)


I'll look into it.

+            if (buf >= buf_end)
+                break;
+        }
+
+        switch (state & 0x1f) {
+        case H264_NAL_IDR_SLICE:
+            idr = 1;
+            goto end;
+        case H264_NAL_SEI:
+            init_get_bits8(&gb, buf, buf_end - buf);


Weird that you checked the init_get_bits8() below but not this one.


The one below was copy-pasted. Will also check this one.

+            do {
+                GetBitContext gb_payload;
+                int ret, type = 0, size = 0;
+
+                do {
+                    if (get_bits_left(&gb) < 8)
+                        goto end;
+                    type += show_bits(&gb, 8);
+                } while (get_bits(&gb, 8) == 255);
+                do {
+                    if (get_bits_left(&gb) < 8)
+                        goto end;
+                    size += show_bits(&gb, 8);
+                } while (get_bits(&gb, 8) == 255);
+
+                if (size > get_bits_left(&gb) / 8)
+                    goto end;
+
+                ret = init_get_bits8(&gb_payload, gb.buffer + 
get_bits_count(&gb) / 8, size);
+                if (ret < 0)
+                    goto end;
+
+                switch (type) {
+                case SEI_TYPE_RECOVERY_POINT:
+                    roll_distance = get_ue_golomb_long(&gb_payload);
+                    break;
+                default:
+                    break;
+                }
+                skip_bits_long(&gb, 8 * size);
+            } while (get_bits_left(&gb) > 0 && show_bits(&gb, 8) != 0x80);


1. You are using the outer bitreader gb only for byte-aligned
reads/skips; better read bytes directly.

Using GetBitContext is cleaner and clearer IMO, but I could useGetByteContext instead i guess. I'd rather not work directly on bytes.

2. You are not unescaping the buffer: The SEI size is the size of an
unescaped SEI (after 0x03 has been stripped away). Knowing the size in
advance comes in really handy for this; this is where only working with
mp4-formatted data brings benefits.

A SEI NALu may have more than one SEI message, and the Recovery Pointmay not be the first. If i can't trust the size read in the loop abovebecause it does not take into account the emulation prevention bytes,how can i feasibly skip each SEI message within a give NALU?

+            break;
+        default:
+            break;
+        }
+        if (nal_length_size)
+            buf += nalsize - 1;
+    }
+
+end:
+    if (roll_distance != (int16_t)roll_distance)


The H.264 spec restricts this field to values in the range
0..MaxFrameNum - 1, where the latter can be UINT16_MAX. So I don't
understand why you are using a signed int16_t here and as type in MOVIentry.


Because ISOBMFF's roll_distance is an int16 field.

+        roll_distance = 0;
+    trk->cluster[trk->entry].roll_distance = roll_distance;
+
+    if (idr) {
+        trk->cluster[trk->entry].flags |= MOV_SYNC_SAMPLE;
+        trk->has_keyframes++;
+    }
+
+    return 0;
+}
+
  static void mov_parse_vc1_frame(AVPacket *pkt, MOVTrack *trk)
  {
      const uint8_t *start, *next, *end = pkt->data + pkt->size;
@@ -5740,6 +5850,7 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)
      trk->cluster[trk->entry].entries          = samples_in_chunk;
      trk->cluster[trk->entry].dts              = pkt->dts;
      trk->cluster[trk->entry].pts              = pkt->pts;
+    trk->cluster[trk->entry].roll_distance    = 0;
      if (!trk->entry && trk->start_dts != AV_NOPTS_VALUE) {
          if (!trk->frag_discont) {
              /* First packet of a new fragment. We already wrote the duration
@@ -5821,6 +5932,8 @@ int ff_mov_write_packet(AVFormatContext *s, AVPacket *pkt)

if (par->codec_id == AV_CODEC_ID_VC1) {

          mov_parse_vc1_frame(pkt, trk);
+    } else if (par->codec_id == AV_CODEC_ID_H264) {
+        mov_parse_h264_frame(pkt, trk);
      } else if (par->codec_id == AV_CODEC_ID_TRUEHD) {
          mov_parse_truehd_frame(pkt, trk);
      } else if (pkt->flags & AV_PKT_FLAG_KEY) {
diff --git a/libavformat/movenc.h b/libavformat/movenc.h
index af1ea0bce6..73bf73ce8f 100644
--- a/libavformat/movenc.h
+++ b/libavformat/movenc.h
@@ -56,6 +56,7 @@ typedef struct MOVIentry {
  #define MOV_PARTIAL_SYNC_SAMPLE 0x0002
  #define MOV_DISPOSABLE_SAMPLE   0x0004
      uint32_t     flags;
+    int16_t      roll_distance;
      AVProducerReferenceTime prft;
  } MOVIentry;

diff --git a/tests/ref/lavf-fate/h264.mp4 b/tests/ref/lavf-fate/h264.mp4

index a9c3823c2c..c08ee4c7ae 100644
--- a/tests/ref/lavf-fate/h264.mp4
+++ b/tests/ref/lavf-fate/h264.mp4
@@ -1,3 +1,3 @@
-fe299ea5205b71a48281f917b1256a5d *tests/data/lavf-fate/lavf.h264.mp4
-547928 tests/data/lavf-fate/lavf.h264.mp4
-tests/data/lavf-fate/lavf.h264.mp4 CRC=0x9da2c999
+2812f617314d23474fcb23898b8a56ab *tests/data/lavf-fate/lavf.h264.mp4
+548084 tests/data/lavf-fate/lavf.h264.mp4
+tests/data/lavf-fate/lavf.h264.mp4 CRC=0x396f0829

I guess this filesize increase is due to the SyncSample table? What
exactly has been written before this patch, what gets written with it?

The size increase is due to the addition of Recovery Point samplegroups. The Sync Sample table is smaller now that we're not listing therecovery point packets in it, but ultimately the file is bigger becauseof the presence of a sgpd and sbgp full boxes.


- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] avformat/movenc: parse h264 packets to build Sync Sample and Recovery Point tables

Reply via email to