strcasecmp and multibyte encodings

Vladimir Támara Patiño Tue, 09 Apr 2013 10:25:49 -0700

Although the behavior of strcasecmp is unspecified for multibyte encodings (Is that right?) http://pubs.opengroup.org/onlinepubs/9699919799/


I wish the attached test (encoded in UTF-8) would pass,
so I'm also attaching a patch for strcasecmp and strncasecmp that makes
this test pass; it uses only LC_CTYPE (but not LC_COLLATE) and a

simple numeric comparison when the strings are different (as the previous version was doing).


--
Dios, gracias por tu amor infinito.

--Vladimir Támara Patiño. http://vtamara.pasosdeJesus.org/

 http://www.pasosdejesus.org/dominio_publico_colombia.html

#include <locale.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/*
 * Checks that strcasecmp() treats the lowercase and uppercase forms of a
 * non-ASCII letter ("ñ" / "Ñ", UTF-8 encoded in this file) as equal under
 * the es_CO.UTF-8 locale.
 *
 * Prints "OK" and returns 0 when they compare equal; prints "Error" and
 * returns 1 when they do not.  Returns 2 if the locale cannot be selected,
 * because the comparison would then say nothing about multibyte handling.
 */
int main(void)
{
        /* Without the requested locale the result below is meaningless. */
        if (setlocale(LC_ALL, "es_CO.UTF-8") == NULL) {
                fprintf(stderr, "setlocale(es_CO.UTF-8) failed\n");
                return 2;
        }

        if (strcasecmp("ñ", "Ñ") != 0) {
                printf("Error\n");
                return 1;
        }

        printf("OK\n");
        return 0;
}

--- src53/lib/libc/string/strcasecmp.c  Mon Mar 25 18:28:29 2013
+++ src/lib/libc/string/strcasecmp.c    Tue Apr  9 11:34:44 2013
@@ -30,6 +30,8 @@
  */
 
 #include <string.h>
+#include <wchar.h>
+#include <wctype.h>
 
 typedef unsigned char u_char;
 
@@ -76,29 +78,36 @@
 int
 strcasecmp(const char *s1, const char *s2)
 {
-       const u_char *cm = charmap;
-       const u_char *us1 = (const u_char *)s1;
-       const u_char *us2 = (const u_char *)s2;
-
-       while (cm[*us1] == cm[*us2++])
-               if (*us1++ == '\0')
-                       return (0);
-       return (cm[*us1] - cm[*--us2]);
+       return strncasecmp(s1, s2, strlen(s1) + 1);
 }
 
+/** Uses LC_CTYPE but not LC_COLLATE */
 int
 strncasecmp(const char *s1, const char *s2, size_t n)
 {
+       mbstate_t mb1, mb2;
+       bzero(&mb1, sizeof(mb1));
+       bzero(&mb2, sizeof(mb2));
+       mbsinit(&mb1);
+       mbsinit(&mb2);
+
        if (n != 0) {
-               const u_char *cm = charmap;
                const u_char *us1 = (const u_char *)s1;
                const u_char *us2 = (const u_char *)s2;
+               size_t lus1 = strlen(us1);
+               size_t lus2 = strlen(us2);
 
                do {
-                       if (cm[*us1] != cm[*us2++])
-                               return (cm[*us1] - cm[*--us2]);
-                       if (*us1++ == '\0')
+                       wchar_t w1, w2, l1, l2;
+                       size_t d1 = mbrtowc(&w1, us1, lus1, &mb1);
+                       size_t d2 = mbrtowc(&w2, us2, lus2, &mb2);
+                       if ((l1 = towlower(w1)) != (l2 = towlower(w2))) {
+                               return l1 - l2;
+                       }
+                       if (*us1 == '\0')
                                break;
+                       us2 += d2;
+                       us1 += d1;
                } while (--n != 0);
        }
        return (0);

strcasecmp and multibyte encodings

Reply via email to