Re: wcwidth replacement problems

Alexander V. Lukyanov Tue, 26 Aug 2008 03:49:25 -0700

On Tue, Aug 26, 2008 at 09:32:32AM +0200, Bruno Haible wrote:
> Alexander V. Lukyanov wrote:
> > Let's measure it.
> > 
> > $ time ./wcwidth-solaris 
> > wcwidth(0x2022)=2
> > 
> > real    0m2.205s
> > user    0m2.200s
> > sys     0m0.000s
> > 
> > $ time ./wcwidth-rpl 
> > wcwidth(0x2022)=1
> > 
> > real    0m55.477s
> > user    0m55.350s
> > sys     0m0.000s
> > 
> > $ time ./wcwidth-mk 
> > wcwidth(0x2022)=1
> > 
> > real    0m1.944s
> > user    0m1.940s
> > sys     0m0.010s
> 
> This is not a fair comparison: wcwidth-mk works only in UTF-8 locales,
> whereas wcwidth() from the system and from gnulib return the right result
> in all locales. The test whether the locale encoding is UTF-8 is precisely
> what takes up most time in the gnulib replacement.


Ok. Would you accept this patch, which caches charset properties?
BTW, it also fixes a bug: the CJK case could never be executed before.

$ time ./wcwidth-rpl1
wcwidth(0x2022)=1

real    0m4.160s
user    0m4.150s
sys     0m0.010s

Worse than the Solaris native wcwidth (due to the cost of nl_langinfo + strncmp),
but still 13 times faster than before.

-- 
   Alexander.

diff --git a/lib/uniwidth.h b/lib/uniwidth.h
index cdc7d96..3979187 100644
--- a/lib/uniwidth.h
+++ b/lib/uniwidth.h
@@ -37,25 +37,7 @@ extern "C" {
 
 /* Determine number of column positions required for UC.  */
 extern int
-       uc_width (ucs4_t uc, const char *encoding);
-
-/* Determine number of column positions required for first N units
-   (or fewer if S ends before this) in S.  */
-extern int
-       u8_width (const uint8_t *s, size_t n, const char *encoding);
-extern int
-       u16_width (const uint16_t *s, size_t n, const char *encoding);
-extern int
-       u32_width (const uint32_t *s, size_t n, const char *encoding);
-
-/* Determine number of column positions required for S.  */
-extern int
-       u8_strwidth (const uint8_t *s, const char *encoding);
-extern int
-       u16_strwidth (const uint16_t *s, const char *encoding);
-extern int
-       u32_strwidth (const uint32_t *s, const char *encoding);
-
+       uc_width (ucs4_t uc, int is_cjk_encoding);
 
 #ifdef __cplusplus
 }
diff --git a/lib/uniwidth/width.c b/lib/uniwidth/width.c
index 4161c26..e43d789 100644
--- a/lib/uniwidth/width.c
+++ b/lib/uniwidth/width.c
@@ -20,8 +20,6 @@
 /* Specification.  */
 #include "uniwidth.h"
 
-#include "cjk.h"
-
 /*
  * Non-spacing attribute table.
  * Consists of:
@@ -267,7 +265,7 @@ static const signed char nonspacing_table_ind[240] = {
 
 /* Determine number of column positions required for UC.  */
 int
-uc_width (ucs4_t uc, const char *encoding)
+uc_width (ucs4_t uc, int is_cjk_encoding)
 {
   /* Test for non-spacing or control character.  */
   if ((uc >> 9) < 240)
@@ -317,7 +315,7 @@ uc_width (ucs4_t uc, const char *encoding)
   /* In ancient CJK encodings, Cyrillic and most other characters are
      double-width as well.  */
   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
-      && is_cjk_encoding (encoding))
+      && is_cjk_encoding)
     return 2;
   return 1;
 }
diff --git a/lib/wcwidth.c b/lib/wcwidth.c
index 4885071..c2de83b 100644
--- a/lib/wcwidth.c
+++ b/lib/wcwidth.c
@@ -22,22 +22,54 @@
 /* Get iswprint.  */
 #include <wctype.h>
 
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
 #include "localcharset.h"
 #include "streq.h"
 #include "uniwidth.h"
 
 #undef wcwidth
 
+#include "uniwidth/cjk.h"
+
+static char cached_encoding[32];
+static int cached_is_cjk_encoding;
+static int cached_is_utf8_encoding;
+
+static const char *locale_charset_simple ()
+{
+#if HAVE_LANGINFO_CODESET
+  /* Most systems support nl_langinfo (CODESET) nowadays.  */
+  return nl_langinfo (CODESET);
+# else
+  /* Do the complex case */
+  return locale_charset ();
+# endif
+}
+
+static void cache_encoding ()
+{
+  const char *encoding = locale_charset_simple ();
+  if (!strncmp(encoding, cached_encoding, sizeof (cached_encoding)))
+    return;
+  strncpy (cached_encoding, encoding, sizeof (cached_encoding));
+  encoding = locale_charset ();
+  cached_is_utf8_encoding = STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0);
+  cached_is_cjk_encoding = is_cjk_encoding (encoding);
+}
+
 int
 rpl_wcwidth (wchar_t wc)
 {
+  cache_encoding ();
   /* In UTF-8 locales, use a Unicode aware width function.  */
-  const char *encoding = locale_charset ();
-  if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
+  if (cached_is_utf8_encoding || cached_is_cjk_encoding)
     {
       /* We assume that in a UTF-8 locale, a wide character is the same as a
         Unicode character.  */
-      return uc_width (wc, encoding);
+      return uc_width (wc, cached_is_cjk_encoding);
     }
   else
     {

Re: wcwidth replacement problems

Reply via email to