Re: 3 important bugs right now - more testing needed!

Georg Baum Sun, 12 Nov 2006 10:55:00 -0800

Am Donnerstag, 9. November 2006 14:53 schrieb Jean-Marc Lasgouttes:
> >>>>> "Jean-Pierre" == Jean-Pierre Chrétien <[EMAIL PROTECTED]> writes:
> 
> Jean-Pierre> Could the diacritics problem come from the list of
> Jean-Pierre> locales not including utf8 ?
> 
> What is the diacritics problem? That we cannot check accented
> characters? We should find a way to read unicode table and decide what
> is the class of the different characters (to replace the code in
> support/textutils.h). I think this is a prerequisite for 1.5.


I agree.

> Is this
>  http://crl.nmsu.edu/~mleisher/ucdata.html
> something we could use?

Maybe.

> Or do we have a way to coerce the c library to 
>  give us the information we need without switching locales?

The C library is only helpful if wchar_t is 4 bytes wide. Otherwise it has
no useful information about UCS4 strings.

> I see here that boost.Regex is able to use character properties:
> http://www.boost.org/libs/regex/doc/character_class_names.html
> 
> Does that mean that we should also be able to access them with our
> current code?

No. They use the wchar_t functions from the C library for wide strings
internally. If those are usable (e.g. on Linux), we may as well use them
directly.

The attached patch does that. I am very tempted to apply it and have the
Windows people look for a solution for their platform. If such a solution
is found, we can still decide whether we use it always or only on Windows.

Unless somebody has very good reasons not to use the C library support for
wchar_t when it is usable, I am going to put this in.

The proper C++ way would be to implement a full UCS4 ctype facet and use 
that instead of the stuff in textutils.h and lstrings.h, but I don't feel 
like doing this now.


Georg

Index: src/buffer.C
===================================================================
--- src/buffer.C	(Revision 15887)
+++ src/buffer.C	(Arbeitskopie)
@@ -554,11 +554,9 @@ void Buffer::insertStringAsLines(Paragra
 				}
 				space_inserted = true;
 			}
-/* FIXME: not needed anymore?
 		} else if (!isPrintable(*cit)) {
 			// Ignore unprintables
 			continue;
-*/
 		} else {
 			// just insert the character
 			par.insertChar(pos, *cit, font, params().trackChanges);
Index: src/support/lstrings.C
===================================================================
--- src/support/lstrings.C	(Revision 15887)
+++ src/support/lstrings.C	(Arbeitskopie)
@@ -32,6 +32,16 @@
 #include <algorithm>
 #include <sstream>
 
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+// All implementations that have a 4byte wchar_t use UCS4 as encoding, so we
+// can simply use the wchar_t C library functions
+#include <wctype.h>
+#else
+// Steal some code from somewhere else, e.g. glib (look at gunicode.h)
+// The code that we currently use does not really work.
+#endif
+
+
 using lyx::docstring;
 
 using std::transform;
@@ -76,8 +86,8 @@ int compare_no_case(docstring const & s,
 	docstring::const_iterator p2 = s2.begin();
 
 	while (p != s.end() && p2 != s2.end()) {
-		int const lc1 = tolower(*p);
-		int const lc2 = tolower(*p2);
+		char_type const lc1 = lowercase(*p);
+		char_type const lc2 = lowercase(*p2);
 		if (lc1 != lc2)
 			return (lc1 < lc2) ? -1 : 1;
 		++p;
@@ -94,7 +104,7 @@ int compare_no_case(docstring const & s,
 
 namespace {
 
-int ascii_tolower(int c) {
+char_type ascii_tolower(char_type c) {
 	if (c >= 'A' && c <= 'Z')
 		return c - 'A' + 'a';
 	return c;
@@ -108,8 +118,8 @@ int do_compare_ascii_no_case(String cons
 	typename String::const_iterator p2 = s2.begin();
 
 	while (p != s.end() && p2 != s2.end()) {
-		int const lc1 = ascii_tolower(*p);
-		int const lc2 = ascii_tolower(*p2);
+		char_type const lc1 = ascii_tolower(*p);
+		char_type const lc2 = ascii_tolower(*p2);
 		if (lc1 != lc2)
 			return (lc1 < lc2) ? -1 : 1;
 		++p;
@@ -300,7 +310,9 @@ char uppercase(char c)
 	return char(toupper(c));
 }
 
-// FIXME for lowercase() and uppercase() function below:
+
+// FIXME UNICODE
+// for lowercase() and uppercase() function below when wchar_t is not used:
 // 1) std::tolower() and std::toupper() are templates that
 // compile fine with char_type. With the test (c >= 256) we
 // do not trust these function to do the right thing with
@@ -310,19 +322,27 @@ char uppercase(char c)
 
 char_type lowercase(char_type c)
 {
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return towlower(c);
+#else
 	if (c >= 256)
 		return c;
 
 	return tolower(c);
+#endif
 }
 
 
 char_type uppercase(char_type c)
 {
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return towupper(c);
+#else
 	if (c >= 256)
 		return c;
 
 	return toupper(c);
+#endif
 }
 
 
Index: src/support/textutils.h
===================================================================
--- src/support/textutils.h	(Revision 15887)
+++ src/support/textutils.h	(Arbeitskopie)
@@ -17,12 +17,21 @@
 
 #include "support/types.h"
 
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+// All implementations that have a 4byte wchar_t use UCS4 as encoding, so we
+// can simply use the wchar_t C library functions
+#include <wctype.h>
+#else
+// Steal some code from somewhere else, e.g. glib (look at gunicode.h)
+// The code that we currently use does not really work.
+#endif
+
 
 namespace lyx {
 
 /// return true if the char is a line separator
 inline
-bool isLineSeparatorChar(lyx::char_type c)
+bool isLineSeparatorChar(char_type c)
 {
 	return c == ' ';
 }
@@ -30,34 +39,55 @@ bool isLineSeparatorChar(lyx::char_type 
 
 /// return true if a char is alphabetical (including accented chars)
 inline
-bool isLetterChar(lyx::char_type c)
+bool isLetterChar(char_type c)
 {
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return iswalpha(c);
+#else
+	// FIXME UNICODE This is wrong!
 	return (c >= 'A' && c <= 'Z')
 		|| (c >= 'a' && c <= 'z')
 		|| (c >= 192 && c < 256); // in iso-8859-x these are accented chars
+#endif
 }
 
 
-/// return true if the char is printable (masked to 7-bit ASCII)
+/// return true if the char is printable
 inline
-bool isPrintable(lyx::char_type c)
+bool isPrintable(char_type c)
 {
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return iswprint(c);
+#else
+	// FIXME UNICODE This is wrong!
 	return (c & 127) >= ' ';
+#endif
 }
 
 
-/// return true if the char is printable and not a space (masked to 7-bit ASCII)
+/// return true if the char is printable and not a space
 inline
-bool isPrintableNonspace(lyx::char_type c)
+bool isPrintableNonspace(char_type c)
 {
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return iswprint(c) && !iswspace(c);
+#else
+	// FIXME UNICODE This is wrong!
 	return (c & 127) > ' ';
+#endif
 }
 
+
 /// return true if a unicode char is a digit.
 inline
-bool isDigit(lyx::char_type ch)
+bool isDigit(char_type c)
 {
-	return ch >= '0' && ch <= '9';
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+	return iswdigit(c);
+#else
+	// FIXME UNICODE This is wrong!
+	return c >= '0' && c <= '9';
+#endif
 }
 
 
Index: src/paragraph.C
===================================================================
--- src/paragraph.C	(Revision 15890)
+++ src/paragraph.C	(Arbeitskopie)
@@ -1392,7 +1392,6 @@ docstring const Paragraph::asString(Buff
 
 	for (pos_type i = beg; i < end; ++i) {
 		value_type const c = getUChar(buffer.params(), i);
-		// FIXME: isPrintable does not work for lyx::char_type
 		if (isPrintable(c))
 			os.put(c);
 		else if (c == META_INSET)
@@ -1570,6 +1569,7 @@ char_type Paragraph::transformChar(char_
 {
 	if (!Encodings::is_arabic(c))
 		if (lyxrc.font_norm_type == LyXRC::ISO_8859_6_8 && isDigit(c))
+			// FIXME UNICODE What does this do?
 			return c + (0xb0 - '0');
 		else
 			return c;

Re: 3 important bugs right now - more testing needed!

Reply via email to