On 2017-08-31, Guenter Milde wrote:

Dear LyX developers,

is there a chance to fix the following problems instead of just
documenting them?

>  lyx2lyx deletes ZWSP characters following literal em- and en-dashes when
>  converting to 2.3 format. If you used literal ZWSP characters (u200b) as
>  optional line breaks after dashes, convert them to 0dd wide space insets
>  before opening your document with LyX 2.3 or the optional line breaks will
>  be lost!

...

>   If using TeX fonts and en- and em-dashes are output as font ligatures,
>   when exporting documents containing en- and em-dashes to the format of
>   LyX 2.0 or earlier, the following line has to be manually added to the
>   unicodesymbols file of that LyX version:<br>
>   0x200b "\\hspace{0pt}" "" "" "" "" # ZERO WIDTH SPACE<br>
>   This avoids "uncodable character" issues if the document is actually
>   loaded by that LyX version. LyX 2.1 and later versions already have the
>   necessary definition in their unicodesymbols file

Export to 2.1 and older changes dashes from ligature to literal.


Proposed changes (see patch below):

1. detect presence of literal as well as ligature dashes in old documents and
   use it for setting \use_dash_ligatures:

   \use_dash_ligatures = default     if none are found
  
   \use_dash_ligatures = True        if ligature dash(es) (\t*hyphens) found
  
   \use_dash_ligatures = False       if literal dash(es) found
  
   \use_dash_ligatures = default     if both are found (+ issue a warning)
   
2. back-convert dashes to \twohyphens / \threehyphens if   
   \use_dash_ligatures = True 
   
   This means 
   
   * unchanged behaviour with LyX <= 2.1
   * unchanged behaviour after round-trip (unless edited with 2.2)
   
3. do not use invisible optional line break characters (ZWSP) in backwards
   conversion.


Günter


diff --git a/lib/lyx2lyx/lyx_2_3.py b/lib/lyx2lyx/lyx_2_3.py
index 73ac45cf00..6305d417b0 100644
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@@ -1841,103 +1841,57 @@ def revert_chapterbib(document):


 def convert_dashligatures(document):
-    " Remove a zero-length space (U+200B) after en- and em-dashes. "
-
+    " Set use_dash_ligatures according to content (literal vs. 'ligature' 
dashes) "
+    # Default:
+    use_dash_ligatures = False # TODO: Get the default from stdtemplate.lyx
+    # Look for dashes (followed by a word or no-break space):
+    # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n"
+    # as interim representation for dash ligatures in 2.2.)
+    has_literal_dashes = has_ligature_dashes = False
+    for i, line in enumerate(document.body):
+        if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line, flags=re.UNICODE):
+            has_literal_dashes = True
+        if re.search(ur"(\\twohyphens|\\threehyphens)", line, 
flags=re.UNICODE):
+            # print "dash in line ", i, document.body[i+1].encode('utf8')
+            if re.match(u"[\w\u00A0]", document.body[i+1], flags=re.UNICODE):
+                has_ligature_dashes = True
+    if has_literal_dashes and has_ligature_dashes:
+        # TODO: insert a warning note in the document?
+        document.warning("""This document contained both literal and 
"ligature" dashes.
+            Line break may have changed. See UserGuide chapter 3.9.1 for 
details.""")
+    elif has_literal_dashes:
+        # print "has literal dashes"
+        use_dash_ligatures = False
+    elif has_ligature_dashes:
+        # print "has ligature dashes"
+        use_dash_ligatures = True
+    # insert the setting
     i = find_token(document.header, "\\use_microtype", 0)
     if i != -1:
-        if document.initial_format > 474 and document.initial_format < 509:
-            # This was created by LyX 2.2
-            document.header[i+1:i+1] = ["\\use_dash_ligatures false"]
-        else:
-            # This was created by LyX 2.1 or earlier
-            document.header[i+1:i+1] = ["\\use_dash_ligatures true"]
-
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
-        # Skip some document parts where dashes are not converted
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
-            j = find_end_of_inset(document.body, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
-                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            if after.startswith(u"\u200B"):
-                document.body[i] = document.body[i][:j+1] + after[1:]
-            else:
-                if len(after) == 0 and 
document.body[i+1].startswith(u"\u200B"):
-                    document.body[i+1] = document.body[i+1][1:]
-                    break
-            start = j+1
-        i += 1
-
+        document.header.insert(i+1, "\\use_dash_ligatures %s"
+                               % str(use_dash_ligatures).lower())

 def revert_dashligatures(document):
-    " Remove font ligature settings for en- and em-dashes. "
+    """ Remove font ligature settings for en- and em-dashes.
+    Revert conversion of \twodashes or \threedashes to literal dashes"""
     i = find_token(document.header, "\\use_dash_ligatures", 0)
     if i == -1:
         return
     use_dash_ligatures = get_bool_value(document.header, 
"\\use_dash_ligatures", i)
     del document.header[i]
-    use_non_tex_fonts = False
     i = find_token(document.header, "\\use_non_tex_fonts", 0)
-    if i != -1:
+    if i == -1:
+        use_non_tex_fonts = False
+    else:
         use_non_tex_fonts = get_bool_value(document.header, 
"\\use_non_tex_fonts", i)
     if not use_dash_ligatures or use_non_tex_fonts:
         return
-
-    # Add a zero-length space (U+200B) after en- and em-dashes
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
-        # Skip some document parts where dashes are not converted
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
-            j = find_end_of_inset(document.body, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
-                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            document.body[i] = document.body[i][:j+1] + u"\u200B" + after
-            start = j+1
-        i += 1
+    new_body = []
+    for line in document.body:
+        line = '\\twohyphens\n'.join(line.split(u'\u2013'))
+        line = '\\threehyphens\n'.join(line.split(u'\u2014'))
+        new_body.extend(line.split('\n'))
+    document.body = new_body


Reply via email to