Jean-Marc Lasgouttes <lasgout...@lyx.org> writes:
> I reverted everything in my local tree and applied first the stuff that
> is not really related (and which moves files around). I still plan to
> apply the code changes in one big chunk.

OK, I am done (I have not tested it yet, but it does compile and it seems
that the only differences from trunk are cosmetic).

The remaining patch is as follows. Testing welcome.

For reference, the corresponding changesets are
http://www.lyx.org/trac/changeset/27454
http://www.lyx.org/trac/changeset/27457
http://www.lyx.org/trac/changeset/27458
http://www.lyx.org/trac/changeset/27481
http://www.lyx.org/trac/changeset/27486
http://www.lyx.org/trac/changeset/27489
http://www.lyx.org/trac/changeset/27547
http://www.lyx.org/trac/changeset/27563
http://www.lyx.org/trac/changeset/27592
http://www.lyx.org/trac/changeset/27596
http://www.lyx.org/trac/changeset/27599
http://www.lyx.org/trac/changeset/27721

JMarc

svndiff

Index: src/Encoding.cpp
===================================================================
--- src/Encoding.cpp	(revision 27910)
+++ src/Encoding.cpp	(working copy)
@@ -513,6 +513,10 @@ docstring Encodings::fromLaTeXCommand(do
 
 void Encodings::initUnicodeMath(Buffer const & buffer)
 {
+#ifdef TEX2LYX
+	// The code below is not needed in tex2lyx and requires additional stuff
+	(void)buffer;
+#else
 	mathcmd.clear();
 	textcmd.clear();
 	mathsym.clear();
@@ -523,11 +527,18 @@ void Encodings::initUnicodeMath(Buffer c
 
 	for (; it != end; ++it)
 		it->initUnicodeMath();
+#endif
 }
 
 
 void Encodings::validate(char_type c, LaTeXFeatures & features, bool for_mathed)
 {
+#ifdef TEX2LYX
+	// The code below is not needed in tex2lyx and requires additional stuff
+	(void)c;
+	(void)features;
+	(void)for_mathed;
+#else
 	CharInfoMap::const_iterator const it = unicodesymbols.find(c);
 	if (it != unicodesymbols.end()) {
 		// In mathed, c could be used both in textmode and mathmode
@@ -566,6 +577,7 @@ void Encodings::validate(char_type c, La
 		features.require("relsize");
 		features.require("lyxmathsym");
 	}
+#endif
 }
 
 
Index: src/tex2lyx/test/test.ltx
===================================================================
--- src/tex2lyx/test/test.ltx	(revision 27910)
+++ src/tex2lyx/test/test.ltx	(working copy)
@@ -75,6 +75,12 @@ foo & bar \\
 bar & foo
 \end{tabular}
 
+Let's try a few unicode characters: the (R) symbol \textregistered
+(and the same one with braces \textregistered{} and a space after) or
+maybe an accented a \'{a} or this one \'a or this \^\i.
+
+Watch out: \textregistered should be glued to its successor here.
+
 Final Text.
 \end{document}
 
Index: src/tex2lyx/Parser.h
===================================================================
--- src/tex2lyx/Parser.h	(revision 27910)
+++ src/tex2lyx/Parser.h	(working copy)
@@ -12,10 +12,11 @@
 #ifndef PARSER_H
 #define PARSER_H
 
-#include <vector>
 #include <string>
 #include <utility>
+#include <vector>
 
+#include "support/docstream.h"
 
 namespace lyx {
 
@@ -46,9 +47,6 @@ enum CatCode {
 };
 
 
-CatCode catcode(unsigned char c);
-
-
 enum {
 	FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
 	FLAG_RIGHT      = 1 << 2,  //  next \\right ends the parsing process
@@ -75,18 +73,16 @@ enum {
 class Token {
 public:
 	///
-	Token() : cs_(), char_(0), cat_(catIgnore) {}
-	///
-	Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
+	Token() : cs_(), cat_(catIgnore) {}
 	///
-	Token(std::string const & cs, CatCode cat) : cs_(cs), char_(0), cat_(cat) {}
+	Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {}
 
 	///
 	std::string const & cs() const { return cs_; }
 	/// Returns the catcode of the token
 	CatCode cat() const { return cat_; }
 	///
-	char character() const { return char_; }
+	char character() const { return cs_.empty() ? 0 : cs_[0]; }
 	/// Returns the token as string
 	std::string asString() const;
 	/// Returns the token verbatim
@@ -96,8 +92,6 @@ private:
 	///
 	std::string cs_;
 	///
-	char char_;
-	///
 	CatCode cat_;
 };
 
@@ -119,9 +113,14 @@ class Parser {
 
 public:
 	///
-	Parser(std::istream & is);
+	Parser(idocstream & is);
 	///
 	Parser(std::string const & s);
+	///
+	~Parser();
+
+	/// change the encoding of the input stream
+	void setEncoding(std::string const & encoding);
 
 	///
 	int lineno() const { return lineno_; }
@@ -174,8 +173,8 @@ public:
 	char getChar();
 	///
 	void error(std::string const & msg);
-	/// Parses \p is into tokens
-	void tokenize(std::istream & is);
+	/// Parses one token from the input stream
+	void tokenize_one();
 	///
 	void push_back(Token const & t);
 	/// The previous token.
@@ -183,11 +182,11 @@ public:
 	/// The current token.
 	Token const & curr_token() const;
 	/// The next token.
-	Token const & next_token() const;
+	Token const & next_token();
 	/// Make the next token current and return that.
 	Token const & get_token();
 	/// \return whether the current token starts a new paragraph
-	bool isParagraph() const;
+	bool isParagraph();
 	/// skips spaces (and comments if \p skip_comments is true)
 	void skip_spaces(bool skip_comments = false);
 	/// puts back spaces (and comments if \p skip_comments is true)
@@ -195,7 +194,7 @@ public:
 	///
 	void lex(std::string const & s);
 	///
-	bool good() const;
+	bool good();
 	///
 	std::string verbatim_item();
 	///
@@ -214,6 +213,10 @@ private:
 	std::vector<Token> tokens_;
 	///
 	unsigned pos_;
+	///
+	idocstringstream * iss_;
+	///
+	idocstream & is_;
 };
 
 
Index: src/tex2lyx/tex2lyx.cpp
===================================================================
--- src/tex2lyx/tex2lyx.cpp	(revision 27910)
+++ src/tex2lyx/tex2lyx.cpp	(working copy)
@@ -15,20 +15,20 @@
 #include "tex2lyx.h"
 
 #include "Context.h"
-#include "TextClass.h"
+#include "Encoding.h"
 #include "Layout.h"
+#include "TextClass.h"
 
-#include "support/lassert.h"
 #include "support/convert.h"
 #include "support/debug.h"
 #include "support/ExceptionMessage.h"
 #include "support/filetools.h"
+#include "support/lassert.h"
 #include "support/lstrings.h"
 #include "support/os.h"
 #include "support/Package.h"
 
 #include <cstdlib>
-#include <fstream>
 #include <iostream>
 #include <string>
 #include <sstream>
@@ -202,7 +202,7 @@ void read_environment(Parser & p, string
  */
 void read_syntaxfile(FileName const & file_name)
 {
-	ifstream is(file_name.toFilesystemEncoding().c_str());
+	ifdocstream is(file_name.toFilesystemEncoding().c_str());
 	if (!is.good()) {
 		cerr << "Could not open syntax file \"" << file_name
 		     << "\" for reading." << endl;
@@ -389,7 +389,7 @@ namespace {
  *  You must ensure that \p parentFilePath is properly set before calling
  *  this function!
  */
-void tex2lyx(istream & is, ostream & os)
+void tex2lyx(idocstream & is, ostream & os)
 {
 	Parser p(is);
 	//p.dump();
@@ -411,7 +411,7 @@ void tex2lyx(istream & is, ostream & os)
 	os << ss.str();
 #ifdef TEST_PARSER
 	p.reset();
-	ofstream parsertest("parsertest.tex");
+	ofdocstream parsertest("parsertest.tex");
 	while (p.good())
 		parsertest << p.get_token().asInput();
 	// <origfile> and parsertest.tex should now have identical content
@@ -422,7 +422,10 @@ void tex2lyx(istream & is, ostream & os)
 /// convert TeX from \p infilename to LyX and write it to \p os
 bool tex2lyx(FileName const & infilename, ostream & os)
 {
-	ifstream is(infilename.toFilesystemEncoding().c_str());
+	ifdocstream is;
+	// forbid buffering on this stream
+	is.rdbuf()->pubsetbuf(0,0);
+	is.open(infilename.toFilesystemEncoding().c_str());
 	if (!is.good()) {
 		cerr << "Could not open input file \"" << infilename
 		     << "\" for reading." << endl;
@@ -485,11 +488,11 @@ int main(int argc, char * argv[])
 	os::init(argc, argv);
 
 	try { init_package(internal_path(to_utf8(from_local8bit(argv[0]))),
-		cl_system_support, cl_user_support,
-		top_build_dir_is_two_levels_up);
+			     cl_system_support, cl_user_support,
+			     top_build_dir_is_two_levels_up);
 	} catch (ExceptionMessage const & message) {
 		cerr << to_utf8(message.title_) << ":\n"
-			<< to_utf8(message.details_) << endl;
+		     << to_utf8(message.details_) << endl;
 		if (message.type_ == ErrorException)
 			exit(1);
 	}
@@ -507,6 +510,7 @@ int main(int argc, char * argv[])
 	} else
 		outfilename = changeExtension(infilename, ".lyx");
 
+	// Read the syntax tables
 	FileName const system_syntaxfile = libFileSearch("", "syntax.default");
 	if (system_syntaxfile.empty()) {
 		cerr << "Error: Could not find syntax file \"syntax.default\"." << endl;
@@ -516,9 +520,24 @@ int main(int argc, char * argv[])
 	if (!syntaxfile.empty())
 		read_syntaxfile(makeAbsPath(syntaxfile));
 
+	// Read the encodings table.
+	FileName const symbols_path = libFileSearch(string(), "unicodesymbols");
+	if (symbols_path.empty()) {
+		cerr << "Error: Could not find file \"unicodesymbols\"." 
+		     << endl;
+		exit(1);
+	}
+	FileName const enc_path = libFileSearch(string(), "encodings");
+	if (enc_path.empty()) {
+		cerr << "Error: Could not find file \"encodings\"." 
+		     << endl;
+		exit(1);
+	}
+	encodings.read(enc_path, symbols_path);
+
+	// The real work now.
 	masterFilePath = onlyPath(infilename);
 	parentFilePath = masterFilePath;
-
 	if (outfilename == "-") {
 		if (tex2lyx(FileName(infilename), cout))
 			return EXIT_SUCCESS;
Index: src/tex2lyx/text.cpp
===================================================================
--- src/tex2lyx/text.cpp	(revision 27910)
+++ src/tex2lyx/text.cpp	(working copy)
@@ -17,6 +17,7 @@
 #include "tex2lyx.h"
 
 #include "Context.h"
+#include "Encoding.h"
 #include "FloatList.h"
 #include "Layout.h"
 #include "Length.h"
@@ -509,7 +510,7 @@ void output_command_layout(ostream & os,
  * The drawback is that the logic inside the function becomes
  * complicated, and that is the reason why it is not implemented.
  */
-void check_space(Parser const & p, ostream & os, Context & context)
+void check_space(Parser & p, ostream & os, Context & context)
 {
 	Token const next = p.next_token();
 	Token const curr = p.curr_token();
@@ -1262,7 +1263,7 @@ void parse_text(Parser & p, ostream & os
 			       t.cat() == catParameter) {
 			// This translates "&" to "\\&" which may be wrong...
 			context.check_layout(os);
-			os << t.character();
+			os << t.cs();
 		}
 
 		else if (p.isParagraph()) {
@@ -1281,7 +1282,7 @@ void parse_text(Parser & p, ostream & os
 				else
 					os << "\\InsetSpace ~\n";
 			} else
-				os << t.character();
+				os << t.cs();
 		}
 
 		else if (t.cat() == catBegin &&
@@ -1309,7 +1310,7 @@ void parse_text(Parser & p, ostream & os
 			    next.character() == '*') {
 				p.get_token();
 				if (p.next_token().cat() == catEnd) {
-					os << next.character();
+					os << next.cs();
 					p.get_token();
 				} else {
 					p.putback();
@@ -1552,8 +1553,9 @@ void parse_text(Parser & p, ostream & os
 			TeXFont const oldFont = context.font;
 			// save the current font size
 			string const size = oldFont.size;
-			// reset the font size to default, because the font size switches don't
-			// affect section headings and the like
+			// reset the font size to default, because the
+			// font size switches don't affect section
+			// headings and the like
 			context.font.size = known_coded_sizes[0];
 			output_font_change(os, oldFont, context.font);
 			// write the layout
@@ -1763,6 +1765,7 @@ void parse_text(Parser & p, ostream & os
 			p.skip_spaces();
 			context.check_layout(os);
 			string const s = p.verbatim_item();
+			//FIXME: this never triggers in UTF8
 			if (s == "\xb1" || s == "\xb3" || s == "\xb2" || s == "\xb5")
 				os << s;
 			else
@@ -2127,25 +2130,31 @@ void parse_text(Parser & p, ostream & os
 
 		else if (t.cs() == "selectlanguage") {
 			context.check_layout(os);
-			// save the language for the case that a \foreignlanguage is used 
+			// save the language for the case that a
+			// \foreignlanguage is used 
+
+			//FIXME: this is wrong, the language should
+			// be saved in the context. (JMarc)
 			selectlang = subst(p.verbatim_item(), "\n", " ");
 			os << "\\lang " << selectlang << "\n";
-			
 		}
 
 		else if (t.cs() == "foreignlanguage") {
 			context.check_layout(os);
 			os << "\n\\lang " << subst(p.verbatim_item(), "\n", " ") << "\n";
 			os << subst(p.verbatim_item(), "\n", " ");
+			// FIXME: the second argument of \foreignlanguage
+			// has to be parsed (like for \textsf, for
+			// example).
 			// set back to last selectlanguage
 			os << "\n\\lang " << selectlang << "\n";
 		}
 
-		else if (t.cs() == "inputencoding")
-			// write nothing because this is done by LyX using the "\lang"
-			// information given by selectlanguage and foreignlanguage
-			subst(p.verbatim_item(), "\n", " ");
-		
+		else if (t.cs() == "inputencoding") {
+			// nothing to write here
+			string const enc = subst(p.verbatim_item(), "\n", " ");
+			p.setEncoding(enc);
+		}
 		else if (t.cs() == "LyX" || t.cs() == "TeX"
 			 || t.cs() == "LaTeX") {
 			context.check_layout(os);
@@ -2238,18 +2247,6 @@ void parse_text(Parser & p, ostream & os
 			handle_ert(os, oss.str(), context);
 		}
 
-		else if (t.cs() == "\"") {
-			context.check_layout(os);
-			string const name = p.verbatim_item();
-			     if (name == "a") os << '\xe4';
-			else if (name == "o") os << '\xf6';
-			else if (name == "u") os << '\xfc';
-			else if (name == "A") os << '\xc4';
-			else if (name == "O") os << '\xd6';
-			else if (name == "U") os << '\xdc';
-			else handle_ert(os, "\"{" + name + "}", context);
-		}
-
 		// Problem: \= creates a tabstop inside the tabbing environment
 		// and else an accent. In the latter case we really would want
 		// \={o} instead of \= o.
@@ -2260,30 +2257,22 @@ void parse_text(Parser & p, ostream & os
 			 || t.cs() == "'" || t.cs() == "`"
 			 || t.cs() == "~" || t.cs() == "." || t.cs() == "=") {
 			// we need the trim as the LyX parser chokes on such spaces
-			// The argument of InsetLatexAccent is parsed as a
-			// subset of LaTeX, so don't parse anything here,
-			// but use the raw argument.
-			// Otherwise we would convert \~{\i} wrongly.
-			// This will of course not translate \~{\ss} to \~{ß},
-			// but that does at least compile and does only look
-			// strange on screen.
-			context.check_layout(os);
-			os << "\\i \\" << t.cs() << "{"
-			   << trim(p.verbatim_item(), " ")
-			   << "}\n";
-		}
-
-		else if (t.cs() == "ss") {
-			context.check_layout(os);
-			os << "\xdf";
-			skip_braces(p); // eat {}
-		}
-
-		else if (t.cs() == "i" || t.cs() == "j" || t.cs() == "l" ||
-			 t.cs() == "L") {
 			context.check_layout(os);
-			os << "\\i \\" << t.cs() << "{}\n";
-			skip_braces(p); // eat {}
+			// try to see whether the string is in unicodesymbols
+			docstring rem;
+			string command = t.asInput() + "{" 
+				+ trim(p.verbatim_item())
+				+ "}";
+			docstring s = encodings.fromLaTeXCommand(from_utf8(command), rem);
+			if (!s.empty()) {
+				if (!rem.empty())
+					cerr << "When parsing " << command 
+					     << ", result is " << to_utf8(s)
+					     << "+" << to_utf8(rem) << endl;
+				os << to_utf8(s);
+			} else
+				// we did not find a non-ert version
+				handle_ert(os, command, context);
 		}
 
 		else if (t.cs() == "\\") {
@@ -2537,6 +2526,19 @@ void parse_text(Parser & p, ostream & os
 		}
 
 		else {
+			// try to see whether the string is in unicodesymbols
+			docstring rem;
+			docstring s = encodings.fromLaTeXCommand(from_utf8(t.asInput()), rem);
+			if (!s.empty()) {
+				if (!rem.empty())
+					cerr << "When parsing " << t.cs() 
+					     << ", result is " << to_utf8(s)
+					     << "+" << to_utf8(rem) << endl;
+				context.check_layout(os);
+				os << to_utf8(s);
+				p.skip_spaces();
+				skip_braces(p); // eat {}
+			}
 			//cerr << "#: " << t << " mode: " << mode << endl;
 			// heuristic: read up to next non-nested space
 			/*
@@ -2550,14 +2552,16 @@ void parse_text(Parser & p, ostream & os
 			cerr << "found ERT: " << s << endl;
 			handle_ert(os, s + ' ', context);
 			*/
-			string name = t.asInput();
-			if (p.next_token().asInput() == "*") {
-				// Starred commands like \vspace*{}
-				p.get_token();				// Eat '*'
-				name += '*';
+			else {
+				string name = t.asInput();
+				if (p.next_token().asInput() == "*") {
+					// Starred commands like \vspace*{}
+					p.get_token();	// Eat '*'
+					name += '*';
+				}
+				if (!parse_command(name, p, os, outer, context))
+					handle_ert(os, name, context);
 			}
-			if (! parse_command(name, p, os, outer, context))
-				handle_ert(os, name, context);
 		}
 
 		if (flags & FLAG_LEAVE) {
Index: src/tex2lyx/table.cpp
===================================================================
--- src/tex2lyx/table.cpp	(revision 27910)
+++ src/tex2lyx/table.cpp	(working copy)
@@ -661,16 +661,15 @@ void parse_table(Parser & p, ostream & o
 			}
 		}
 
-		else if (t.cat() == catSpace || t.cat() == catNewline)
-				os << t.cs();
-
-		else if (t.cat() == catLetter ||
-			       t.cat() == catSuper ||
-			       t.cat() == catSub ||
-			       t.cat() == catOther ||
-			       t.cat() == catActive ||
-			       t.cat() == catParameter)
-			os << t.character();
+		else if (t.cat() == catSpace 
+			 || t.cat() == catNewline
+			 || t.cat() == catLetter 
+			 || t.cat() == catSuper 
+			 || t.cat() == catSub 
+			 || t.cat() == catOther 
+			 || t.cat() == catActive 
+			 || t.cat() == catParameter)
+			os << t.cs();
 
 		else if (t.cat() == catBegin) {
 			os << '{';
Index: src/tex2lyx/preamble.cpp
===================================================================
--- src/tex2lyx/preamble.cpp	(revision 27910)
+++ src/tex2lyx/preamble.cpp	(working copy)
@@ -248,7 +248,7 @@ string const scale_as_percentage(string 
 }
 
 
-void handle_package(string const & name, string const & opts,
+void handle_package(Parser &p, string const & name, string const & opts,
 		    bool in_lyx_preamble)
 {
 	vector<string> options = split_options(opts);
@@ -326,9 +326,10 @@ void handle_package(string const & name,
 		; // ignore this
 
 	else if (name == "inputenc") {
-		// only set when there is not more than one inputenc option
-		// therefore check for the "," character
-		// also only set when there is not more then one babel language option
+		// only set when there is not more than one inputenc
+		// option (therefore check for the "," character); also
+		// only set when there is not more than one babel
+		// language option
 		if (opts.find(",") == string::npos && one_language == true) {
 			if (opts == "ascii")
 				//change ascii to auto to be in the unicode range, see
@@ -337,6 +338,8 @@ void handle_package(string const & name,
 			else if (!opts.empty())
 				h_inputencoding = opts;
 		}
+		if (!options.empty())
+			p.setEncoding(options.back());
 		options.clear();
 	}
 
@@ -413,7 +416,7 @@ void handle_package(string const & name,
 void end_preamble(ostream & os, TextClass const & /*textclass*/)
 {
 	os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n"
-	   << "\\lyxformat 247\n"
+	   << "\\lyxformat 249\n"
 	   << "\\begin_document\n"
 	   << "\\begin_header\n"
 	   << "\\textclass " << h_textclass << "\n";
@@ -663,21 +666,19 @@ void parse_preamble(Parser & p, ostream 
 		else if (t.cs() == "usepackage") {
 			string const options = p.getArg('[', ']');
 			string const name = p.getArg('{', '}');
-			if (options.empty() && name.find(',')) {
-				vector<string> vecnames;
-				split(name, vecnames, ',');
-				vector<string>::const_iterator it  = vecnames.begin();
-				vector<string>::const_iterator end = vecnames.end();
-				for (; it != end; ++it)
-					handle_package(trim(*it), string(), 
-						       in_lyx_preamble);
-			} else {
-				handle_package(name, options, in_lyx_preamble);
-			}
+			vector<string> vecnames;
+			split(name, vecnames, ',');
+			vector<string>::const_iterator it  = vecnames.begin();
+			vector<string>::const_iterator end = vecnames.end();
+			for (; it != end; ++it)
+				handle_package(p, trim(*it), options, 
+					       in_lyx_preamble);
 		}
 
 		else if (t.cs() == "inputencoding") {
-			h_inputencoding = p.getArg('{','}');
+			string const encoding = p.getArg('{','}');
+			h_inputencoding = encoding;
+			p.setEncoding(encoding);
 		}
 
 		else if (t.cs() == "newenvironment") {
Index: src/tex2lyx/Makefile.am
===================================================================
--- src/tex2lyx/Makefile.am	(revision 27911)
+++ src/tex2lyx/Makefile.am	(working copy)
@@ -41,7 +41,8 @@ LINKED_FILES = \
 	../Lexer.cpp \
 	../lengthcommon.cpp \
 	../Color.cpp \
-	../Color.h
+	../Color.h \
+	../Encoding.cpp
 
 BUILT_SOURCES = $(PCH_FILE)
 
Index: src/tex2lyx/math.cpp
===================================================================
--- src/tex2lyx/math.cpp	(revision 27910)
+++ src/tex2lyx/math.cpp	(working copy)
@@ -94,7 +94,7 @@ void parse_math(Parser & p, ostream & os
 			       t.cat() == catAlign ||
 			       t.cat() == catActive ||
 			       t.cat() == catParameter)
-			os << t.character();
+			os << t.cs();
 
 		else if (t.cat() == catBegin) {
 			os << '{';
Index: src/tex2lyx/Parser.cpp
===================================================================
--- src/tex2lyx/Parser.cpp	(revision 27910)
+++ src/tex2lyx/Parser.cpp	(working copy)
@@ -10,10 +10,10 @@
 
 #include <config.h>
 
+#include "Encoding.h"
 #include "Parser.h"
 
 #include <iostream>
-#include <sstream>
 
 using namespace std;
 
@@ -25,6 +25,11 @@ CatCode theCatcode[256];
 
 void catInit()
 {
+	static bool init_done = false;
+	if (init_done) 
+		return;
+	init_done = true;
+
 	fill(theCatcode, theCatcode + 256, catOther);
 	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
 	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
@@ -49,13 +54,12 @@ void catInit()
 	theCatcode[int('@')]  = catLetter;
 }
 
-
 /*!
  * Translate a line ending to '\n'.
  * \p c must have catcode catNewline, and it must be the last character read
  * from \p is.
  */
-char getNewline(istream & is, char c)
+char getNewline(idocstream & is, char c)
 {
 	// we have to handle 3 different line endings:
 	// - UNIX (\n)
@@ -63,9 +67,10 @@ char getNewline(istream & is, char c)
 	// - DOS  (\r\n)
 	if (c == '\r') {
 		// MAC or DOS
-		if (is.get(c) && c != '\n') {
+		char_type wc;
+		if (is.get(wc) && wc != '\n') {
 			// MAC
-			is.putback(c);
+			is.putback(wc);
 		}
 		return '\n';
 	}
@@ -73,18 +78,14 @@ char getNewline(istream & is, char c)
 	return c;
 }
 
-}
-
-
-//
-// catcodes
-//
-
-CatCode catcode(unsigned char c)
+CatCode catcode(char_type c)
 {
-	return theCatcode[c];
+	if (c < 256)
+		return theCatcode[(unsigned char)c];
+	return catOther;
 }
 
+}
 
 
 //
@@ -100,18 +101,18 @@ ostream & operator<<(ostream & os, Token
 	else if (t.cat() == catEscape)
 		os << '\\' << t.cs() << ' ';
 	else if (t.cat() == catLetter)
-		os << t.character();
+		os << t.cs();
 	else if (t.cat() == catNewline)
 		os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 	else
-		os << '[' << t.character() << ',' << t.cat() << ']';
+		os << '[' << t.cs() << ',' << t.cat() << ']';
 	return os;
 }
 
 
 string Token::asString() const
 {
-	return cs_.size() ? cs_ : string(1, char_);
+	return cs_;
 }
 
 
@@ -119,9 +120,9 @@ string Token::asInput() const
 {
 	if (cat_ == catComment)
 		return '%' + cs_ + '\n';
-	if (cat_ == catSpace || cat_ == catNewline)
-		return cs_;
-	return char_ ? string(1, char_) : '\\' + cs_;
+	if (cat_ == catEscape)
+		return '\\' + cs_;
+	return cs_;
 }
 
 
@@ -130,18 +131,30 @@ string Token::asInput() const
 //
 
 
-Parser::Parser(istream & is)
-	: lineno_(0), pos_(0)
+Parser::Parser(idocstream & is)
+	: lineno_(0), pos_(0), iss_(0), is_(is)
 {
-	tokenize(is);
 }
 
 
 Parser::Parser(string const & s)
-	: lineno_(0), pos_(0)
+	: lineno_(0), pos_(0), 
+	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_)
+{
+}
+
+
+Parser::~Parser()
+{
+	delete iss_;
+}
+
+
+void Parser::setEncoding(std::string const & e)
 {
-	istringstream is(s);
-	tokenize(is);
+	Encoding const * enc = encodings.fromLaTeXName(e);
+	cerr << "setting encoding to " << enc->iconvName();
+	is_ << lyx::setEncoding(enc->iconvName());
 }
 
 
@@ -165,7 +178,7 @@ Token const & Parser::curr_token() const
 }
 
 
-Token const & Parser::next_token() const
+Token const & Parser::next_token()
 {
 	static const Token dummy;
 	return good() ? tokens_[pos_] : dummy;
@@ -180,7 +193,7 @@ Token const & Parser::get_token()
 }
 
 
-bool Parser::isParagraph() const
+bool Parser::isParagraph()
 {
 	// A new paragraph in TeX ist started
 	// - either by a newline, following any amount of whitespace
@@ -246,8 +259,11 @@ void Parser::putback()
 }
 
 
-bool Parser::good() const
+bool Parser::good()
 {
+	if (pos_ < tokens_.size())
+		return true;
+	tokenize_one();
 	return pos_ < tokens_.size();
 }
 
@@ -256,7 +272,7 @@ char Parser::getChar()
 {
 	if (!good())
 		error("The input stream is not well...");
-	return tokens_[pos_++].character();
+	return get_token().character();
 }
 
 
@@ -351,86 +367,80 @@ string const Parser::verbatimEnvironment
 }
 
 
-void Parser::tokenize(istream & is)
+void Parser::tokenize_one()
 {
-	static bool init_done = false;
-
-	if (!init_done) {
-		catInit();
-		init_done = true;
+	catInit();
+	char_type c;
+	if (!is_.get(c)) 
+		return;
+
+	switch (catcode(c)) {
+	case catSpace: {
+		docstring s(1, c);
+		while (is_.get(c) && catcode(c) == catSpace)
+			s += c;
+		if (catcode(c) != catSpace)
+			is_.putback(c);
+		push_back(Token(s, catSpace));
+		break;
 	}
-
-	char c;
-	while (is.get(c)) {
-		//cerr << "reading c: " << c << "\n";
-
-		switch (catcode(c)) {
-			case catSpace: {
-				string s(1, c);
-				while (is.get(c) && catcode(c) == catSpace)
-					s += c;
-				if (catcode(c) != catSpace)
-					is.putback(c);
-				push_back(Token(s, catSpace));
-				break;
-			}
-
-			case catNewline: {
-				++lineno_;
-				string s(1, getNewline(is, c));
-				while (is.get(c) && catcode(c) == catNewline) {
-					++lineno_;
-					s += getNewline(is, c);
-				}
-				if (catcode(c) != catNewline)
-					is.putback(c);
-				push_back(Token(s, catNewline));
-				break;
-			}
-
-			case catComment: {
-				// We don't treat "%\n" combinations here specially because
-				// we want to preserve them in the preamble
-				string s;
-				while (is.get(c) && catcode(c) != catNewline)
+		
+	case catNewline: {
+		++lineno_;
+		docstring s(1, getNewline(is_, c));
+		while (is_.get(c) && catcode(c) == catNewline) {
+			++lineno_;
+			s += getNewline(is_, c);
+		}
+		if (catcode(c) != catNewline)
+			is_.putback(c);
+		push_back(Token(s, catNewline));
+		break;
+	}
+		
+	case catComment: {
+		// We don't treat "%\n" combinations here specially because
+		// we want to preserve them in the preamble
+		docstring s;
+		while (is_.get(c) && catcode(c) != catNewline)
+			s += c;
+		// handle possible DOS line ending
+		if (catcode(c) == catNewline)
+			c = getNewline(is_, c);
+		// Note: The '%' at the beginning and the '\n' at the end
+		// of the comment are not stored.
+		++lineno_;
+		push_back(Token(s, catComment));
+		break;
+	}
+		
+	case catEscape: {
+		is_.get(c);
+		if (!is_) {
+			error("unexpected end of input");
+		} else {
+			docstring s(1, c);
+			if (catcode(c) == catLetter) {
+				// collect letters
+				while (is_.get(c) && catcode(c) == catLetter)
 					s += c;
-				// handle possible DOS line ending
-				if (catcode(c) == catNewline)
-					c = getNewline(is, c);
-				// Note: The '%' at the beginning and the '\n' at the end
-				// of the comment are not stored.
-				++lineno_;
-				push_back(Token(s, catComment));
-				break;
+				if (catcode(c) != catLetter)
+					is_.putback(c);
 			}
-
-			case catEscape: {
-				is.get(c);
-				if (!is) {
-					error("unexpected end of input");
-				} else {
-					string s(1, c);
-					if (catcode(c) == catLetter) {
-						// collect letters
-						while (is.get(c) && catcode(c) == catLetter)
-							s += c;
-						if (catcode(c) != catLetter)
-							is.putback(c);
-					}
-					push_back(Token(s, catEscape));
-				}
-				break;
-			}
-
-			case catIgnore: {
-				cerr << "ignoring a char: " << int(c) << "\n";
-				break;
-			}
-
-			default:
-				push_back(Token(c, catcode(c)));
+			push_back(Token(s, catEscape));
 		}
+		break;
+	}
+		
+	case catIgnore: {
+		cerr << "ignoring a char: " << c << "\n";
+		break;
+	}
+		
+	default:
+		push_back(Token(docstring(1, c), catcode(c)));
 	}
+	//cerr << tokens_.back();
 }
 
 
@@ -459,7 +469,7 @@ string Parser::verbatimOption()
 	string res;
 	if (next_token().character() == '[') {
 		Token t = get_token();
-		for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
+		for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
 			if (t.cat() == catBegin) {
 				putback();
 				res += '{' + verbatim_item() + '}';

Reply via email to