On 2017-08-31, Guenter Milde wrote: Dear LyX developers,
is there a chance to fix the following problems instead of just documenting them? > lyx2lyx deletes ZWSP characters following literal em- and en-dashes when > converting to 2.3 format. If you used literal ZWSP characters (u200b) as > optional line breaks after dashes, convert them to 0dd wide space insets > before opening your document with LyX 2.3 or the optional line breaks will > be lost! ... > If using TeX fonts and en- and em-dashes are output as font ligatures, > when exporting documents containing en- and em-dashes to the format of > LyX 2.0 or earlier, the following line has to be manually added to the > unicodesymbols file of that LyX version:<br> > 0x200b "\\hspace{0pt}" "" "" "" "" # ZERO WIDTH SPACE<br> > This avoids "uncodable character" issues if the document is actually > loaded by that LyX version. LyX 2.1 and later versions already have the > necessary definition in their unicodesymbols file. Export to 2.1 and older changes dashes from ligature to literal. Proposed changes (see patch below): 1. detect presence of literal as well as ligature dashes in old documents and use it for setting \use_dash_ligatures: \use_dash_ligatures = default if none are found \use_dash_ligatures = True if ligature dash(es) (\t*hyphens) found \use_dash_ligatures = False if literal dash(es) found \use_dash_ligatures = default if both are found (+ issue a warning) 2. back-convert dashes to \twohyphens / \threehyphens if \use_dash_ligatures = True This means * unchanged behaviour with LyX <= 2.1 * unchanged behaviour after round-trip (unless edited with 2.2) 3. do not use invisible optional line break characters (ZWSP) in backwards conversion. Günter diff --git a/lib/lyx2lyx/lyx_2_3.py b/lib/lyx2lyx/lyx_2_3.py index 73ac45cf00..6305d417b0 100644 --- a/lib/lyx2lyx/lyx_2_3.py +++ b/lib/lyx2lyx/lyx_2_3.py @@ -1841,103 +1841,57 @@ def revert_chapterbib(document): def convert_dashligatures(document): - " Remove a zero-length space (U+200B) after en- and em-dashes. 
" - + " Set use_dash_ligatures according to content (literal vs. 'ligature' dashes) " + # Default: + use_dash_ligatures = False # TODO: Get the default from stdtemplate.lyx + # Look for dashes (followed by a word or no-break space): + # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n" + # as interim representation for dash ligatures in 2.2.) + has_literal_dashes = has_ligature_dashes = False + for i, line in enumerate(document.body): + if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line, flags=re.UNICODE): + has_literal_dashes = True + if re.search(ur"(\\twohyphens|\\threehyphens)", line, flags=re.UNICODE): + # print "dash in line ", i, document.body[i+1].encode('utf8') + if re.match(u"[\w\u00A0]", document.body[i+1], flags=re.UNICODE): + has_ligature_dashes = True + if has_literal_dashes and has_ligature_dashes: + # TODO: insert a warning note in the document? + document.warning("""This document contained both literal and "ligature" dashes. + Line break may have changed. 
See UserGuide chapter 3.9.1 for details.""") + elif has_literal_dashes: + # print "has literal dashes" + use_dash_ligatures = False + elif has_ligature_dashes: + # print "has ligature dashes" + use_dash_ligatures = True + # insert the setting i = find_token(document.header, "\\use_microtype", 0) if i != -1: - if document.initial_format > 474 and document.initial_format < 509: - # This was created by LyX 2.2 - document.header[i+1:i+1] = ["\\use_dash_ligatures false"] - else: - # This was created by LyX 2.1 or earlier - document.header[i+1:i+1] = ["\\use_dash_ligatures true"] - - i = 0 - while i < len(document.body): - words = document.body[i].split() - # Skip some document parts where dashes are not converted - if len(words) > 1 and words[0] == "\\begin_inset" and \ - words[1] in ["CommandInset", "ERT", "External", "Formula", \ - "FormulaMacro", "Graphics", "IPA", "listings"]: - j = find_end_of_inset(document.body, i) - if j == -1: - document.warning("Malformed LyX document: Can't find end of " \ - + words[1] + " inset at line " + str(i)) - i += 1 - else: - i = j - continue - if len(words) > 0 and words[0] in ["\\leftindent", \ - "\\paragraph_spacing", "\\align", "\\labelwidthstring"]: - i += 1 - continue - - start = 0 - while True: - j = document.body[i].find(u"\u2013", start) # en-dash - k = document.body[i].find(u"\u2014", start) # em-dash - if j == -1 and k == -1: - break - if j == -1 or (k != -1 and k < j): - j = k - after = document.body[i][j+1:] - if after.startswith(u"\u200B"): - document.body[i] = document.body[i][:j+1] + after[1:] - else: - if len(after) == 0 and document.body[i+1].startswith(u"\u200B"): - document.body[i+1] = document.body[i+1][1:] - break - start = j+1 - i += 1 - + document.header.insert(i+1, "\\use_dash_ligatures %s" + % str(use_dash_ligatures).lower()) def revert_dashligatures(document): - " Remove font ligature settings for en- and em-dashes. " + """ Remove font ligature settings for en- and em-dashes. 
+ Revert conversion of \twodashes or \threedashes to literal dashes""" i = find_token(document.header, "\\use_dash_ligatures", 0) if i == -1: return use_dash_ligatures = get_bool_value(document.header, "\\use_dash_ligatures", i) del document.header[i] - use_non_tex_fonts = False i = find_token(document.header, "\\use_non_tex_fonts", 0) - if i != -1: + if i == -1: + use_non_tex_fonts = False + else: use_non_tex_fonts = get_bool_value(document.header, "\\use_non_tex_fonts", i) if not use_dash_ligatures or use_non_tex_fonts: return - - # Add a zero-length space (U+200B) after en- and em-dashes - i = 0 - while i < len(document.body): - words = document.body[i].split() - # Skip some document parts where dashes are not converted - if len(words) > 1 and words[0] == "\\begin_inset" and \ - words[1] in ["CommandInset", "ERT", "External", "Formula", \ - "FormulaMacro", "Graphics", "IPA", "listings"]: - j = find_end_of_inset(document.body, i) - if j == -1: - document.warning("Malformed LyX document: Can't find end of " \ - + words[1] + " inset at line " + str(i)) - i += 1 - else: - i = j - continue - if len(words) > 0 and words[0] in ["\\leftindent", \ - "\\paragraph_spacing", "\\align", "\\labelwidthstring"]: - i += 1 - continue - - start = 0 - while True: - j = document.body[i].find(u"\u2013", start) # en-dash - k = document.body[i].find(u"\u2014", start) # em-dash - if j == -1 and k == -1: - break - if j == -1 or (k != -1 and k < j): - j = k - after = document.body[i][j+1:] - document.body[i] = document.body[i][:j+1] + u"\u200B" + after - start = j+1 - i += 1 + new_body = [] + for line in document.body: + line = '\\twohyphens\n'.join(line.split(u'\u2013')) + line = '\\threehyphens\n'.join(line.split(u'\u2014')) + new_body.extend(line.split('\n')) + document.body = new_body