Guenter Milde wrote: > Without special code for tex2lyx, \AA and \aa are the better > lib/unicodesymbols replacements, as there is a fallback for \r{A}, \r{a} > but not for \aa, \AA. > > With the special code, it is a matter of taste - maybe best decided by a > native speaker of a Scandinavian language using the character. > > I favour \AA \aa, as this is nearer to the Latin transliteration and the > old spelling in Danish.
After experimenting a bit I found out that it is possible to translate all variants without special code and without fallback if using \AA and \aa, so I changed my mind. > Also, instead of special tex2lyx code just for \aa and \AA, we could use > NFC (or NFKC) normalization¹ after the "LyXification" - this would not > only map \r{A} to 00C5 (via 0041 030A) but also other similar cases of > non-unique LICRs for pre-composed accented characters. I was about to write that we do not have code for doing that, but then I vaguely remembered that I had dealt with normalization in the past, and we do indeed have normalize_c() in docstring.h, which does exactly what we want. The attached patch fixes all issues: - A new flag "deprecated" is introduced which makes it explicit that a certain symbol should not be used for LaTeX => char_type conversion. It is not strictly needed for U+00C5 vs. U+212B, since U+212B > U+00C5, but will be useful for other symbols. - \AA and \aa are used in lib/unicodesymbols exclusively. This adds \aa to the known symbols of tex2lyx. - Normalization to precomposed form is applied in tex2lyx for symbols from lib/unicodesymbols, since editing in LyX is easier with precomposed symbols. OK? Georg
diff --git a/lib/unicodesymbols b/lib/unicodesymbols index ff53354..a830b06 100644 --- a/lib/unicodesymbols +++ b/lib/unicodesymbols @@ -48,6 +48,7 @@ # - notermination=both Do not terminate this textcommand and mathcommand (by {} or space). # - notermination=none Always terminate this textcommand and mathcommand (by {} or space). # - tipashortcut=<shortcut> Shortcut notation for TIPA +# - deprecated Do not use this symbol for backwards conversion in LyX and tex2lyx. 0x00a0 "~" "" "notermination=both" "~" "" # NO-BREAK SPACE 0x00a1 "\\textexclamdown" "" "" # INVERTED EXCLAMATION MARK @@ -86,7 +87,7 @@ 0x00c2 "\\^{A}" "" "mathalpha" "\\hat{A}" # LATIN CAPITAL LETTER A WITH CIRCUMFLEX 0x00c3 "\\~{A}" "" "mathalpha" "\\tilde{A}" # LATIN CAPITAL LETTER A WITH TILDE 0x00c4 "\\\"{A}" "" "mathalpha" "\\ddot{A}" # LATIN CAPITAL LETTER A WITH DIAERESIS -0x00c5 "\\r{A}" "" "mathalpha" "\\mathring{A}" # LATIN CAPITAL LETTER A WITH RING ABOVE +0x00c5 "\\AA" "" "mathalpha" "\\mathring{A}" "" # LATIN CAPITAL LETTER A WITH RING ABOVE 0x00c6 "\\AE" "" "" # LATIN CAPITAL LETTER AE 0x00c7 "\\c{C}" "" "mathalpha" "\\cedilla{C}" "accents,cedilla" # LATIN CAPITAL LETTER C WITH CEDILLA 0x00c8 "\\`{E}" "" "mathalpha" "\\grave{E}" # LATIN CAPITAL LETTER E WITH GRAVE @@ -118,7 +119,7 @@ 0x00e2 "\\^{a}" "" "mathalpha" "\\hat{a}" # LATIN SMALL LETTER A WITH CIRCUMFLEX 0x00e3 "\\~{a}" "" "mathalpha" "\\tilde{a}" # LATIN SMALL LETTER A WITH TILDE 0x00e4 "\\\"{a}" "" "mathalpha" "\\ddot{a}" # LATIN SMALL LETTER A WITH DIAERESIS -0x00e5 "\\r{a}" "" "mathalpha" "\\mathring{a}" # LATIN SMALL LETTER A WITH RING ABOVE +0x00e5 "\\aa" "" "mathalpha" "\\mathring{a}" "" # LATIN SMALL LETTER A WITH RING ABOVE 0x00e6 "\\ae" "" "" # LATIN SMALL LETTER AE 0x00e7 "\\c{c}" "" "mathalpha" "\\cedilla{c}" "accents,cedilla" # LATIN SMALL LETTER C WITH CEDILLA 0x00e8 "\\`{e}" "" "mathalpha" "\\grave{e}" # LATIN SMALL LETTER E WITH GRAVE @@ -1882,7 +1883,7 @@ 0x2128 "" "" "" "\\mathfrak{Z}" "amssymb" # BLACK-LETTER 
CAPITAL Z #0x2129 "" "" "" "" "" # TURNED GREEK SMALL LETTER IOTA 0x212a "K" "" "notermination=text" "" "" # KELVIN SIGN -0x212b "\\AA" "" "force=utf8" "" "" # ANGSTROM SIGN +0x212b "\\AA" "" "force=utf8,deprecated" "" "" # ANGSTROM SIGN 0x212c "" "" "" "\\mathscr{B}" "mathrsfs" # SCRIPT CAPITAL B 0x212d "" "" "" "\\mathfrak{C}" "amssymb" # BLACK-LETTER CAPITAL C 0x212e "\\textestimated" "textcomp" "" # ESTIMATED SYMBOL diff --git a/src/Encoding.cpp b/src/Encoding.cpp index 0d1d116..73edc84 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -317,6 +317,8 @@ char_type Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, CharInfoMap::const_iterator const end = unicodesymbols.end(); CharInfoMap::const_iterator it = unicodesymbols.begin(); for (combining = false; it != end; ++it) { + if (it->second.deprecated()) + continue; docstring const math = it->second.mathcommand(); docstring const text = it->second.textcommand(); if ((cmdtype & MATH_CMD) && math == cmd) { @@ -402,6 +404,8 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, size_t unicmd_size = 0; char_type c = 0; for (; it != uniend; ++it) { + if (it->second.deprecated()) + continue; docstring const math = mathmode ? it->second.mathcommand() : docstring(); docstring const text = textmode ? 
it->second.textcommand() @@ -722,6 +726,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) flags &= ~CharInfoMathNoTermination; } else if (contains(flag, "tipashortcut=")) { tipashortcut = split(flag, '='); + } else if (flag == "deprecated") { + flags |= CharInfoDeprecated; } else { lyxerr << "Ignoring unknown flag `" << flag << "' for symbol `0x" diff --git a/src/Encoding.h b/src/Encoding.h index f1513c8..aac632e 100644 --- a/src/Encoding.h +++ b/src/Encoding.h @@ -52,6 +52,8 @@ enum CharInfoFlags { CharInfoMathNoTermination = 32, /// CharInfoForceSelected = 64, + /// + CharInfoDeprecated = 128 }; @@ -86,6 +88,8 @@ public: bool force() const { return flags_ & CharInfoForce ? true : false; } /// Force the LaTeX command for some encodings? bool forceselected() const { return flags_ & CharInfoForceSelected ? true : false; } + /// Disable LaTeX command => char_type conversion for this deprecated symbol? + bool deprecated() const { return flags_ & CharInfoDeprecated ? true : false; } /// TIPA shortcut std::string const tipashortcut() const { return tipashortcut_; } /// \c textcommand needs no termination (such as {} or space). diff --git a/src/tex2lyx/test/test-insets.lyx.lyx b/src/tex2lyx/test/test-insets.lyx.lyx index ff3129b..e8a67b1 100644 --- a/src/tex2lyx/test/test-insets.lyx.lyx +++ b/src/tex2lyx/test/test-insets.lyx.lyx @@ -6907,29 +6907,7 @@ Other symbols \end_layout \begin_layout Standard -All three should be converted to U+00C5: Å Å Å (not U+212B). All three should be converted to U+00E5: -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -aa -\end_layout - -\end_inset - - -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout -{} -\end_layout - -\end_inset - - å å. +All three should be converted to U+00C5: Å Å Å (not U+212B). All three should be converted to U+00E5: å å å. 
\end_layout \begin_layout Subsection diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp index 73a8d69..ce330a2 100644 --- a/src/tex2lyx/text.cpp +++ b/src/tex2lyx/text.cpp @@ -521,8 +521,8 @@ docstring convert_unicodesymbols(docstring s) bool termination; docstring rem; set<string> req; - docstring parsed = encodings.fromLaTeXCommand(s, - Encodings::TEXT_CMD, termination, rem, &req); + docstring parsed = normalize_c(encodings.fromLaTeXCommand(s, + Encodings::TEXT_CMD, termination, rem, &req)); set<string>::const_iterator it = req.begin(); set<string>::const_iterator en = req.end(); for (; it != en; ++it) @@ -4824,8 +4824,8 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, bool termination; docstring rem; set<string> req; - docstring s = encodings.fromLaTeXCommand(from_utf8(name), - Encodings::TEXT_CMD, termination, rem, &req); + docstring s = normalize_c(encodings.fromLaTeXCommand(from_utf8(name), + Encodings::TEXT_CMD, termination, rem, &req)); if (!s.empty()) { context.check_layout(os); os << to_utf8(s);