New submission from Filip Salomonsson:
locale.strxfrm currently does not handle non-ASCII strings:
$ ./python
Python 3.0a2 (py3k:59482, Dec 13 2007, 21:27:14)
[GCC 4.1.2 20070626 (Red Hat 4.1.2-14)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import locale
>>> locale.setlocale(locale.LC_COLLATE, "en_US.utf8")
'en_US.utf8'
>>> locale.strxfrm("a")
'\x0c\x01\x08\x01\x02'
>>> locale.strxfrm("\N{LATIN SMALL LETTER A WITH DIAERESIS}")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: strxfrm() argument 1 must be string without null bytes, not str
The attached patch tries to fix this:
$ ./python
Python 3.0a2 (py3k:59482M, Dec 13 2007, 21:58:09)
[GCC 4.1.2 20070626 (Red Hat 4.1.2-14)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import locale
>>> locale.setlocale(locale.LC_COLLATE, "en_US.utf8")
'en_US.utf8'
>>> locale.strxfrm("a")
'.\x01\x10\x01\x02'
>>> locale.strxfrm("\N{LATIN SMALL LETTER A WITH DIAERESIS}")
'.\x01\x19\x01\x02'
>>> alist = list("aboåäöABOÅÄÖñÑ")
>>> sorted(alist, cmp=locale.strcoll) == sorted(alist, key=locale.strxfrm)
True
The patch does not include the build-system changes needed to define
HAVE_WCSXFRM, since I really don't know how to do that properly (to compile
it, I edited 'configure' and 'pyconfig.h.in' manually).
----------
files: strxfrm-unicode.diff
messages: 58592
nosy: filips
severity: normal
status: open
title: locale.strxfrm can't handle non-ascii strings
type: behavior
versions: Python 3.0
Added file: http://bugs.python.org/file8946/strxfrm-unicode.diff
__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1618>
__________________________________
Index: Modules/_localemodule.c
===================================================================
--- Modules/_localemodule.c (revision 59482)
+++ Modules/_localemodule.c (working copy)
@@ -250,6 +250,7 @@
static PyObject*
PyLocale_strxfrm(PyObject* self, PyObject* args)
{
+#if !defined(HAVE_WCSXFRM)
char *s, *buf;
size_t n1, n2;
PyObject *result;
@@ -273,6 +274,43 @@
result = PyUnicode_FromString(buf);
PyMem_Free(buf);
return result;
+#else
+ PyObject *s, *result = NULL;
+ wchar_t *buf = NULL, *ws = NULL;
+ int len;
+ size_t len2;
+ if (!PyArg_UnpackTuple(args, "strxfrm", 1, 1, &s))
+ return NULL;
+ /* Argument must be unicode, or it's an error. */
+ if (!PyUnicode_Check(s)) {
+ PyErr_SetString(PyExc_ValueError, "strxfrm arguments must be strings");
+ }
+ /* Convert the unicode string to wchar[]. */
+ len = PyUnicode_GET_SIZE(s) + 1;
+ ws = PyMem_MALLOC(len * sizeof(wchar_t));
+ if (!ws) {
+ PyErr_NoMemory();
+ goto done;
+ }
+ if (PyUnicode_AsWideChar((PyUnicodeObject*)s, ws, len) == -1)
+ goto done;
+ ws[len - 1] = 0;
+
+ /* Get the transformation. */
+ len2 = wcsxfrm(NULL, ws, 0) + 1;
+ buf = PyMem_MALLOC(len2 * sizeof(wchar_t));
+ if (!buf) {
+ PyErr_NoMemory();
+ goto done;
+ }
+ wcsxfrm(buf, ws, len2);
+ result = PyUnicode_FromWideChar(buf, len2 - 1);
+ done:
+ /* Deallocate everything. */
+ if (ws) PyMem_FREE(ws);
+ if (buf) PyMem_FREE(buf);
+ return result;
+#endif
}
#if defined(MS_WINDOWS)
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com