On 11/11/18 22:43, Andreas Rheinhardt wrote: > Instead of using a combination of bitreader and -writer for copying data, > one can byte-align the (obsolete and removed) bitreader to improve > performance. > With the right alignment one can even use memcpy. The right alignment > normally exists for CABAC and hence for H.265 in general. > For aligned data this reduced the time to copy the slicedata from > 776520 decicycles to 33889 with 262144 runs and a 6.5mb/s H.264 video. > For unaligned data the number went down from 279196 to 97739 decicycles. > --- > libavcodec/cbs_h2645.c | 119 ++++++++++++++++++++++++----------------- > 1 file changed, 69 insertions(+), 50 deletions(-) > > diff --git a/libavcodec/cbs_h2645.c b/libavcodec/cbs_h2645.c > index e55bd00183..416d3fd32a 100644 > --- a/libavcodec/cbs_h2645.c > +++ b/libavcodec/cbs_h2645.c > @@ -1050,6 +1050,64 @@ static int > cbs_h265_read_nal_unit(CodedBitstreamContext *ctx, > return 0; > } > > +static int cbs_h2645_write_slice_data(CodedBitstreamContext *ctx, > + PutBitContext *pbc, const uint8_t > *data, > + size_t data_size, int data_bit_start) > +{ > + size_t rest = data_size - (data_bit_start + 7) / 8; > + const uint8_t *pos = data + data_bit_start / 8; > + > + av_assert0(data_bit_start >= 0 && > + 8 * data_size > data_bit_start); > + > + if (data_size * 8 + 8 > put_bits_left(pbc)) > + return AVERROR(ENOSPC); > + > + if (!rest) > + goto rbsp_stop_one_bit; > + > + // First copy the remaining bits of the first byte > + // The above check ensures that we do not accidentally > + // copy beyond the rbsp_stop_one_bit. > + if (data_bit_start % 8) > + put_bits(pbc, 8 - data_bit_start % 8, > + *pos++ & MAX_UINT_BITS(8 - data_bit_start % 8)); > + > + if (put_bits_count(pbc) % 8 == 0) { > + // If the writer is aligned at this point, > + // memcpy can be used to improve performance. > + // This happens normally for CABAC. 
> + flush_put_bits(pbc); > + memcpy(put_bits_ptr(pbc), pos, rest); > + skip_put_bytes(pbc, rest); > + } else { > + // If not, we have to copy manually. > + // rbsp_stop_one_bit forces us to special-case > + // the last byte. > + uint8_t temp; > + int i; > + > + for (; rest > 4; rest -= 4, pos += 4) > + put_bits32(pbc, AV_RB32(pos)); > + > + for (; rest > 1; rest--, pos++) > + put_bits(pbc, 8, *pos); > + > + rbsp_stop_one_bit: > + temp = rest ? *pos : *pos & MAX_UINT_BITS(8 - data_bit_start % 8); > + > + av_assert0(temp); > + i = ff_ctz(*pos); > + temp = temp >> i; > + i = rest ? (8 - i) : (8 - i - data_bit_start % 8); > + put_bits(pbc, i, temp); > + if (put_bits_count(pbc) % 8) > + put_bits(pbc, 8 - put_bits_count(pbc) % 8, 0U); > + } > + > + return 0; > +} > + > static int cbs_h264_write_nal_unit(CodedBitstreamContext *ctx, > CodedBitstreamUnit *unit, > PutBitContext *pbc) > @@ -1100,37 +1158,17 @@ static int > cbs_h264_write_nal_unit(CodedBitstreamContext *ctx, > case H264_NAL_AUXILIARY_SLICE: > { > H264RawSlice *slice = unit->content; > - GetBitContext gbc; > - int bits_left, end, zeroes; > > err = cbs_h264_write_slice_header(ctx, pbc, &slice->header); > if (err < 0) > return err; > > if (slice->data) { > - if (slice->data_size * 8 + 8 > put_bits_left(pbc)) > - return AVERROR(ENOSPC); > - > - init_get_bits(&gbc, slice->data, slice->data_size * 8); > - skip_bits_long(&gbc, slice->data_bit_start); > - > - // Copy in two-byte blocks, but stop before copying the > - // rbsp_stop_one_bit in the final byte. > - while (get_bits_left(&gbc) > 23) > - put_bits(pbc, 16, get_bits(&gbc, 16)); > - > - bits_left = get_bits_left(&gbc); > - end = get_bits(&gbc, bits_left); > - > - // rbsp_stop_one_bit must be present here. 
> - av_assert0(end); > - zeroes = ff_ctz(end); > - if (bits_left > zeroes + 1) > - put_bits(pbc, bits_left - zeroes - 1, > - end >> (zeroes + 1)); > - put_bits(pbc, 1, 1); > - while (put_bits_count(pbc) % 8 != 0) > - put_bits(pbc, 1, 0); > + err = cbs_h2645_write_slice_data(ctx, pbc, slice->data, > + slice->data_size, > + slice->data_bit_start); > + if (err < 0) > + return err; > } else { > // No slice data - that was just the header. > // (Bitstream may be unaligned!) > @@ -1254,39 +1292,20 @@ static int > cbs_h265_write_nal_unit(CodedBitstreamContext *ctx, > case HEVC_NAL_CRA_NUT: > { > H265RawSlice *slice = unit->content; > - GetBitContext gbc; > - int bits_left, end, zeroes; > > err = cbs_h265_write_slice_segment_header(ctx, pbc, > &slice->header); > if (err < 0) > return err; > > if (slice->data) { > - if (slice->data_size * 8 + 8 > put_bits_left(pbc)) > - return AVERROR(ENOSPC); > - > - init_get_bits(&gbc, slice->data, slice->data_size * 8); > - skip_bits_long(&gbc, slice->data_bit_start); > - > - // Copy in two-byte blocks, but stop before copying the > - // rbsp_stop_one_bit in the final byte. > - while (get_bits_left(&gbc) > 23) > - put_bits(pbc, 16, get_bits(&gbc, 16)); > - > - bits_left = get_bits_left(&gbc); > - end = get_bits(&gbc, bits_left); > - > - // rbsp_stop_one_bit must be present here. > - av_assert0(end); > - zeroes = ff_ctz(end); > - if (bits_left > zeroes + 1) > - put_bits(pbc, bits_left - zeroes - 1, > - end >> (zeroes + 1)); > - put_bits(pbc, 1, 1); > - while (put_bits_count(pbc) % 8 != 0) > - put_bits(pbc, 1, 0); > + err = cbs_h2645_write_slice_data(ctx, pbc, slice->data, > + slice->data_size, > + slice->data_bit_start); > + if (err < 0) > + return err; > } else { > // No slice data - that was just the header. > + // (Bitstream may be unaligned!)
This comment change isn't accurate - the bitstream will always be aligned for H.265 in this case because byte alignment is included at the end of the slice segment header. (I've just removed it.) > } > } > break; > LGTM, tested, applied. Thanks! - Mark On 11/11/18 22:32, Andreas Rheinhardt wrote: > ... Btw: What was the normal speedup you got when copying in the aligned > mode? Macro-level tests come out very well here. Test files: A: 900 frames of single-slice 4K H.265, 1.5GB (900 writes of slice data, averaging 1.6MB each). B: 38074 frames of 32-slice 1080p H.264, 1.5GB (1.2 million writes of slice data, averaging 1.2kB each). In each case, load the input file into memory, run "./ffmpeg -i input-file -c:v copy -bsf:v hxxx_metadata -f null -" five times and average the result. Intel 8700 (Coffee Lake): A: ~83fps -> ~274fps B: ~5150fps -> ~8160fps Rockchip 3288 (Cortex A15): A: ~31fps -> ~48fps B: ~1210fps -> ~1700fps So, around 50% increase in hxxx_metadata throughput for these cases. (And I guess Intel is very good at large memcpy for the first one.) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel