New submission from Filip Salomonsson:

locale.strxfrm currently fails on non-ASCII strings, raising a TypeError:

$ ./python
Python 3.0a2 (py3k:59482, Dec 13 2007, 21:27:14) 
[GCC 4.1.2 20070626 (Red Hat 4.1.2-14)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import locale
>>> locale.setlocale(locale.LC_COLLATE, "en_US.utf8")
'en_US.utf8'
>>> locale.strxfrm("a")
'\x0c\x01\x08\x01\x02'
>>> locale.strxfrm("\N{LATIN SMALL LETTER A WITH DIAERESIS}")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: strxfrm() argument 1 must be string without null bytes, not str

The attached patch tries to fix this:

$ ./python
Python 3.0a2 (py3k:59482M, Dec 13 2007, 21:58:09) 
[GCC 4.1.2 20070626 (Red Hat 4.1.2-14)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import locale
>>> locale.setlocale(locale.LC_COLLATE, "en_US.utf8")
'en_US.utf8'
>>> locale.strxfrm("a")
'.\x01\x10\x01\x02'
>>> locale.strxfrm("\N{LATIN SMALL LETTER A WITH DIAERESIS}")
'.\x01\x19\x01\x02'
>>> alist = list("aboåäöABOÅÄÖñÑ")
>>> sorted(alist, cmp=locale.strcoll) == sorted(alist, key=locale.strxfrm)
True


The patch does not include the build-system changes needed to define
HAVE_WCSXFRM, since I don't know how to do that properly; to compile it,
I edited 'configure' and 'pyconfig.h.in' by hand.

----------
files: strxfrm-unicode.diff
messages: 58592
nosy: filips
severity: normal
status: open
title: locale.strxfrm can't handle non-ascii strings
type: behavior
versions: Python 3.0
Added file: http://bugs.python.org/file8946/strxfrm-unicode.diff

__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1618>
__________________________________
Index: Modules/_localemodule.c
===================================================================
--- Modules/_localemodule.c	(revision 59482)
+++ Modules/_localemodule.c	(working copy)
@@ -250,6 +250,7 @@
 static PyObject*
 PyLocale_strxfrm(PyObject* self, PyObject* args)
 {
+#if !defined(HAVE_WCSXFRM)
     char *s, *buf;
     size_t n1, n2;
     PyObject *result;
@@ -273,6 +274,43 @@
     result = PyUnicode_FromString(buf);
     PyMem_Free(buf);
     return result;
+#else
+    PyObject *s, *result = NULL;
+    wchar_t *buf = NULL, *ws = NULL;
+    int len;
+    size_t len2;
+    if (!PyArg_UnpackTuple(args, "strxfrm", 1, 1, &s))
+        return NULL;
+    /* Argument must be unicode, or it's an error. */
+    if (!PyUnicode_Check(s)) {
+        PyErr_SetString(PyExc_ValueError, "strxfrm arguments must be strings");
+    }
+    /* Convert the unicode string to wchar[]. */
+    len = PyUnicode_GET_SIZE(s) + 1;
+    ws = PyMem_MALLOC(len * sizeof(wchar_t));
+    if (!ws) {
+        PyErr_NoMemory();
+        goto done;
+    }
+    if (PyUnicode_AsWideChar((PyUnicodeObject*)s, ws, len) == -1)
+        goto done;
+    ws[len - 1] = 0;
+
+    /* Get the transformation. */
+    len2 = wcsxfrm(NULL, ws, 0) + 1;
+    buf = PyMem_MALLOC(len2 * sizeof(wchar_t));
+    if (!buf) {
+        PyErr_NoMemory();
+        goto done;
+    }
+    wcsxfrm(buf, ws, len2);
+    result = PyUnicode_FromWideChar(buf, len2 - 1);
+  done:
+    /* Deallocate everything. */
+    if (ws) PyMem_FREE(ws);
+    if (buf) PyMem_FREE(buf);
+    return result;
+#endif
 }
 
 #if defined(MS_WINDOWS)
_______________________________________________
Python-bugs-list mailing list 
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to