On Sat, 24 Mar 2018 15:48:36 +0100 wm4 <nfx...@googlemail.com> wrote:
> Subtitles which contained styled UTF-8 subtitles (i.e. not just 7-bit > ASCII characters) were not handled correctly. The spec mandates that > styling start/end ranges are in "characters". It's not quite clear > what a "character" is supposed to be, but maybe they mean Unicode > codepoints. > > FFmpeg's decoder treated the style ranges as byte indexes, which could > lead to UTF-8 sequences being broken, and the common code dropping the > whole subtitle line. > > Change this and count codepoints instead. This also means that even > if this is somehow wrong, the decoder won't break UTF-8 sequences > anymore. The sample which led me to investigate this now appears to > work correctly. > --- > https://github.com/mpv-player/mpv/issues/5675 > --- > libavcodec/movtextdec.c | 50 > ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 > insertions(+), 13 deletions(-) > > diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c > index bd19577724..89ac791602 100644 > --- a/libavcodec/movtextdec.c > +++ b/libavcodec/movtextdec.c > @@ -326,9 +326,24 @@ static const Box box_types[] = { > > const static size_t box_count = FF_ARRAY_ELEMS(box_types); > > +// Return byte length of the UTF-8 sequence starting at text[0]. 0 > on error. +static int get_utf8_length_at(const char *text, const char > *text_end) +{ > + const char *start = text; > + int err = 0; > + uint32_t c; > + GET_UTF8(c, text < text_end ? 
(uint8_t)*text++ : (err = 1, 0), > goto error;); > + if (err) > + goto error; > + return text - start; > +error: > + return 0; > +} > + > static int text_to_ass(AVBPrint *buf, const char *text, const char > *text_end, > - MovTextContext *m) > + AVCodecContext *avctx) > { > + MovTextContext *m = avctx->priv_data; > int i = 0; > int j = 0; > int text_pos = 0; > @@ -342,6 +357,8 @@ static int text_to_ass(AVBPrint *buf, const char > *text, const char *text_end, } > > while (text < text_end) { > + int len; > + > if (m->box_flags & STYL_BOX) { > for (i = 0; i < m->style_entries; i++) { > if (m->s[i]->style_flag && text_pos == > m->s[i]->style_end) { @@ -388,17 +405,24 @@ static int > text_to_ass(AVBPrint *buf, const char *text, const char *text_end, } > } > > - switch (*text) { > - case '\r': > - break; > - case '\n': > - av_bprintf(buf, "\\N"); > - break; > - default: > - av_bprint_chars(buf, *text, 1); > - break; > + len = get_utf8_length_at(text, text_end); > + if (len < 1) { > + av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in > subtitle\n"); > + len = 1; > + } > + for (i = 0; i < len; i++) { > + switch (*text) { > + case '\r': > + break; > + case '\n': > + av_bprintf(buf, "\\N"); > + break; > + default: > + av_bprint_chars(buf, *text, 1); > + break; > + } > + text++; > } > - text++; > text_pos++; > } > > @@ -507,10 +531,10 @@ static int mov_text_decode_frame(AVCodecContext > *avctx, } > m->tracksize = m->tracksize + tsmb_size; > } > - text_to_ass(&buf, ptr, end, m); > + text_to_ass(&buf, ptr, end, avctx); > mov_text_cleanup(m); > } else > - text_to_ass(&buf, ptr, end, m); > + text_to_ass(&buf, ptr, end, avctx); > > ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, > NULL); av_bprint_finalize(&buf, NULL); Ship it. Thanks! --phil _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel