Hi, the following patch adds support for a fictitious file format called 249 which has the property that file encoding is utf-8.
The revert part is more difficult. I should revert to the encoding associated with the language, and then verify each character to guarantee that we don't lose any information in the transformation. So, what are we waiting for to put unicode in head? ;-) -- José Abílio
Index: lyx2lyx_lang.py
===================================================================
--- lyx2lyx_lang.py (revision 0)
+++ lyx2lyx_lang.py (revision 0)
@@ -0,0 +1,136 @@
+# Lookup tables used by get_encoding() in LyX.py.
+#
+# lang maps a language name to a list of six strings; element 3 is the
+# encoding that get_encoding() returns for that language.  The remaining
+# elements appear to be: LaTeX language name, display name, right-to-left
+# flag, (encoding), locale code and extra preamble code -- TODO confirm.
+# NOTE(review): the end of the french preamble string was reconstructed
+# from LyX's lib/languages file -- please verify.
+lang = {'afrikaans': ['afrikaans', 'Afrikaans', 'false', 'iso8859-1', 'af_ZA', ''],
+        'american': ['american', 'American', 'false', 'iso8859-1', 'en_US', ''],
+        'arabic': ['arabic', 'Arabic', 'true', 'iso8859-6', 'ar_SA', ''],
+        'austrian': ['austrian', 'Austrian', 'false', 'iso8859-1', 'de_AT', ''],
+        'bahasa': ['bahasa', 'Bahasa', 'false', 'iso8859-1', 'in_ID', ''],
+        'basque': ['basque', 'Basque', 'false', 'iso8859-1', 'eu_ES', ''],
+        'belarusian': ['belarusian', 'Belarusian', 'false', 'cp1251', 'be_BY', ''],
+        'brazil': ['brazil',
+                   'Portuguese (Brazil)',
+                   'false',
+                   'iso8859-1',
+                   'pt_BR',
+                   ''],
+        'breton': ['breton', 'Breton', 'false', 'iso8859-1', 'br_FR', ''],
+        'british': ['british', 'British', 'false', 'iso8859-1', 'en_GB', ''],
+        'bulgarian': ['bulgarian',
+                      'Bulgarian',
+                      'false',
+                      'cp1251',
+                      'bg_BG',
+                      ''],
+        'canadian': ['canadian', 'Canadian', 'false', 'iso8859-1', 'en_CA', ''],
+        'canadien': ['canadien',
+                     'French Canadian',
+                     'false',
+                     'iso8859-1',
+                     'fr_CA',
+                     ''],
+        'catalan': ['catalan', 'Catalan', 'false', 'iso8859-1', 'ca_ES', ''],
+        'croatian': ['croatian', 'Croatian', 'false', 'iso8859-2', 'hr_HR', ''],
+        'czech': ['czech', 'Czech', 'false', 'iso8859-2', 'cs_CZ', ''],
+        'danish': ['danish', 'Danish', 'false', 'iso8859-1', 'da_DK', ''],
+        'dutch': ['dutch', 'Dutch', 'false', 'iso8859-1', 'nl_NL', ''],
+        'english': ['english', 'English', 'false', 'iso8859-1', 'en_US', ''],
+        'esperanto': ['esperanto', 'Esperanto', 'false', 'iso8859-3', 'eo', ''],
+        'estonian': ['estonian', 'Estonian', 'false', 'iso8859-1', 'et_EE', ''],
+        'finnish': ['finnish', 'Finnish', 'false', 'iso8859-1', 'fi_FI', ''],
+        'french': ['french',
+                   'French',
+                   'false',
+                   'iso8859-1',
+                   'fr_FR',
+                   '\\addto\\extrasfrench{\\providecommand{\\og}{\\leavevmode\\flqq~}\\providecommand{\\fg}{\\ifdim\\lastskip>\\z@\\unskip\\fi~\\frqq}}'],
+        'galician': ['galician', 'Galician', 'false', 'iso8859-1', 'gl_ES', ''],
+        'german': ['german', 'German', 'false', 'iso8859-1', 'de_DE', ''],
+        'greek': ['greek', 'Greek', 'false', 'iso8859-7', 'el_GR', ''],
+        'hebrew': ['hebrew', 'Hebrew', 'true', 'cp1255', 'he_IL', ''],
+        'icelandic': ['icelandic',
+                      'Icelandic',
+                      'false',
+                      'iso8859-1',
+                      'is_IS',
+                      ''],
+        'irish': ['irish', 'Irish', 'false', 'iso8859-1', 'ga_IE', ''],
+        'italian': ['italian', 'Italian', 'false', 'iso8859-1', 'it_IT', ''],
+        'kazakh': ['kazakh', 'Kazakh', 'false', 'pt154', 'kk_KZ', ''],
+        'latvian': ['latvian', 'Latvian', 'false', 'iso8859-13', 'lv_LV', ''],
+        'lithuanian': ['lithuanian',
+                       'Lithuanian',
+                       'false',
+                       'iso8859-13',
+                       'lt_LT',
+                       ''],
+        'magyar': ['magyar', 'Magyar', 'false', 'iso8859-2', 'hu_HU', ''],
+        'naustrian': ['naustrian',
+                      'Austrian (new spelling)',
+                      'false',
+                      'iso8859-1',
+                      'de_AT',
+                      ''],
+        'ngerman': ['ngerman',
+                    'German (new spelling)',
+                    'false',
+                    'iso8859-1',
+                    'de_DE',
+                    ''],
+        'norsk': ['norsk', 'Norsk', 'false', 'iso8859-1', 'no_NO', ''],
+        'nynorsk': ['nynorsk', 'Nynorsk', 'false', 'iso8859-1', 'nn_NO', ''],
+        'polish': ['polish', 'Polish', 'false', 'iso8859-2', 'pl_PL', ''],
+        'portuges': ['portuges', 'Portugese', 'false', 'iso8859-1', 'pt_PT', ''],
+        'romanian': ['romanian', 'Romanian', 'false', 'iso8859-2', 'ro_RO', ''],
+        'russian': ['russian', 'Russian', 'false', 'koi8', 'ru_RU', ''],
+        'scottish': ['scottish', 'Scottish', 'false', 'iso8859-1', 'gd_GB', ''],
+        'serbian': ['croatian', 'Serbian', 'false', 'iso8859-5', 'sr_HR', ''],
+        'serbocroatian': ['croatian',
+                          'Serbo-Croatian',
+                          'false',
+                          'iso8859-2',
+                          'sh_HR',
+                          ''],
+        'slovak': ['slovak', 'Slovak', 'false', 'iso8859-2', 'sk_SK', ''],
+        'slovene': ['slovene', 'Slovene', 'false', 'iso8859-2', 'sl_SI', ''],
+        'spanish': ['spanish',
+                    'Spanish',
+                    'false',
+                    'iso8859-1',
+                    'es_ES',
+                    '\\deactivatetilden'],
+        'swedish': ['swedish', 'Swedish', 'false', 'iso8859-1', 'sv_SE', ''],
+        'thai': ['thai',
+                 'Thai',
+                 'false',
+                 'tis620-0',
+                 'th_TH',
+                 '\\usepackage{thswitch}'],
+        'turkish': ['turkish', 'Turkish', 'false', 'iso8859-9', 'tr_TR', ''],
+        'ukrainian': ['ukrainian', 'Ukrainian', 'false', 'koi8-u', 'uk_UA', ''],
+        'welsh': ['welsh', 'Welsh', 'false', 'iso8859-1', 'cy_GB', '']}
+
+# enc maps the value of \inputencoding to an encoding name in the same
+# naming scheme as element 3 of the lang entries.
+enc = {'cp1251': 'cp1251',
+       'cp1255': 'cp1255',
+       'iso-8859-7': 'iso8859-7',
+       'iso88595': 'iso8859-5',
+       'koi8-r': 'koi8',
+       'koi8-u': 'koi8-u',
+       'l7xenc': 'iso8859-13',
+       'latin1': 'iso8859-1',
+       'latin2': 'iso8859-2',
+       'latin3': 'iso8859-3',
+       'latin4': 'iso8859-4',
+       'latin5': 'iso8859-9',
+       'latin9': 'iso8859-15',
+       'pt154': 'pt154',
+       'unknown': 'tis620-0',
+       'utf-8': 'utf-8',
+       'utf8': 'utf-8'}
Index: LyX.py
===================================================================
--- LyX.py (revision 14537)
+++ LyX.py (working copy)
@@ -60,7 +60,7 @@
                    ("1_2", [220], ["1.2.%d" % i for i in range(5)] + ["1.2"]),
                    ("1_3", [221], ["1.3.%d" % i for i in range(8)] + ["1.3"]),
                    ("1_4", range(222,246), ["1.4.0", "1.4.1", "1.4.2","1.4.3svn"]),
-                   ("1_5", range(246,249), ["1.5.0svn","1.5"])]
+                   ("1_5", range(246,250), ["1.5.0svn","1.5"])]
 
 
 def formats_list():
@@ -95,6 +95,23 @@
     return line[:-1]
 
 
+def get_encoding(language, inputencoding):
+    """ get_encoding(language, inputencoding) -> string
+    Return the name of the Python codec for a document whose header
+    declares the given language and inputencoding."""
+    from lyx2lyx_lang import lang, enc
+    # Encoding names from the tables that Python does not know as codecs
+    python_names = {'koi8': 'koi8_r', 'tis620-0': 'tis_620'}
+    if inputencoding == "default":
+        return "iso8859-1"
+    if inputencoding in enc:
+        encoding = enc[inputencoding]
+    else:
+        # "auto", or a missing or unknown value: use the encoding of the
+        # document language, and that of english if it is unknown too
+        encoding = lang.get(language, lang['english'])[3]
+    return python_names.get(encoding, encoding)
+
 ##
 # Class
 #
@@ -189,21 +206,28 @@
 
                 self.header.append(line)
 
+        self.textclass = get_value(self.header, "\\textclass", 0)
+        self.backend = get_backend(self.textclass)
+        self.format = self.read_format()
+        self.language = get_value(self.header, "\\language", 0, default = "english")
+        self.inputencoding = get_value(self.header, "\\inputencoding", 0, default = "auto")
+        self.encoding = get_encoding(self.language, self.inputencoding)
+        self.initial_version = self.read_version()
+
+        # Second pass over header and preamble, now we know the file encoding
+        for i in range(len(self.header)):
+            self.header[i] = self.header[i].decode(self.encoding)
+        for i in range(len(self.preamble)):
+            self.preamble[i] = self.preamble[i].decode(self.encoding)
+
+        # Read document body
         while 1:
-            line = self.input.readline()
+            line = self.input.readline().decode(self.encoding)
             if not line:
                 break
             self.body.append(trim_eol(line))
 
 
-        self.textclass = get_value(self.header, "\\textclass", 0)
-        self.backend = get_backend(self.textclass)
-        self.format = self.read_format()
-        self.language = get_value(self.header, "\\language", 0)
-        if self.language == "":
-            self.language = "english"
-        self.initial_version = self.read_version()
-
     def write(self):
         " Writes the LyX file to self.output."
         self.set_version()
@@ -220,7 +244,7 @@
             header = self.header
 
         for line in header + [''] + self.body:
-            self.output.write(line+"\n")
+            self.output.write(line.encode(self.encoding)+"\n")
 
 
     def choose_io(self, input, output):
Index: parser_tools.py
===================================================================
--- parser_tools.py (revision 14536)
+++ parser_tools.py (working copy)
@@ -141,8 +141,11 @@
         return -1
 
 
-def get_value(lines, token, start, end = 0):
-    """ get_value(lines, token, start[, end]) -> list of strings
+# default is what get_value returns when the token line carries no value.
+# NOTE(review): the token-not-found path lies outside this patch; confirm
+# that it returns default as well, or the defaults passed by LyX.read are lost.
+def get_value(lines, token, start, end = 0, default = ""):
+    """ get_value(lines, token, start[, end[, default]]) -> string
     Return tokens after token for the first line, in lines, where
     token is the first element."""
 
@@ -153,7 +156,7 @@
     if len(lines[i].split()) > 1:
         return lines[i].split()[1]
     else:
-        return ""
+        return default
 
 
 def del_token(lines, token, start, end):
Index: lyx_1_5.py
===================================================================
--- lyx_1_5.py (revision 14537)
+++ lyx_1_5.py (working copy)
@@ -216,6 +216,16 @@
         i = i + 1
 
 
+def convert_utf8(document):
+    " Declare utf-8 as the input encoding, so the file is written in utf-8."
+    i = find_token(document.header, "\\inputencoding", 0)
+    if i == -1:
+        document.header.append("\\inputencoding utf-8")
+    else:
+        document.header[i] = "\\inputencoding utf-8"
+    document.inputencoding = "utf-8"
+    document.encoding = "utf-8"
+
 ##
 # Conversion hub
 #
@@ -223,7 +233,9 @@
 supported_versions = ["1.5.0svn","1.5"]
 convert = [[246, []],
            [247, [convert_font_settings]],
-           [248, []]]
+           [248, []],
+           [249, [convert_utf8]]
+          ]
 
 revert =  [[247, [revert_booktabs]],
            [246, [revert_font_settings]],