Alexander Belopolsky <belopol...@users.sourceforge.net> added the comment:
On Mon, Nov 29, 2010 at 4:41 AM, Marc-Andre Lemburg
<rep...@bugs.python.org> wrote:
..
> It would be better to copy and iterate over the Unicode string first,
> replacing any decimal code points with ASCII ones and then call the
> UTF-8 encoder.
>
Good idea.
> The code as it stands is very inefficient, since it will most likely
> run the memcpy() part for every code point after the first non-ASCII
> decimal one.
>
I doubt there are measurable gains from this optimization, but doing
the conversion on Unicode characters results in a cleaner API. The new
patch, issue10557a.diff, implements
_PyUnicode_NormalizeDecimal(Py_UNICODE *s, Py_ssize_t length), which is
defined as follows:
/* Strip leading and trailing space and convert code points that have decimal
   digit property to the corresponding ASCII digit code point.

   Returns a new Unicode string on success, NULL on failure.
*/
Note that I used the deprecated _PyUnicode_AsStringAndSize() in
floatobject.c not only because it is convenient, but also because I
believe that in the future numeric parsers should be converted
to operate on Unicode characters. When this happens, the use of
_PyUnicode_AsStringAndSize() can be removed.
----------
Added file: http://bugs.python.org/file19872/issue10557a.diff
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue10557>
_______________________________________
Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h (revision 86843)
+++ Include/unicodeobject.h (working copy)
@@ -1173,6 +1173,17 @@
const char *errors /* error handling */
);
+/* Strip leading and trailing space and convert code points that have decimal
+ digit property to the corresponding ASCII digit code point.
+
+ Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) _PyUnicode_NormalizeDecimal(
+ Py_UNICODE *s, /* Unicode buffer */
+ Py_ssize_t length /* Number of Py_UNICODE chars to encode */
+ );
+
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 86843)
+++ Objects/unicodeobject.c (working copy)
@@ -6207,6 +6207,40 @@
return NULL;
}
+PyObject *
+_PyUnicode_NormalizeDecimal(Py_UNICODE *s,
+ Py_ssize_t length)
+{
+ PyObject *result;
+ Py_UNICODE *p; /* write pointer into result */
+ const Py_UNICODE *end = s + length;
+ Py_ssize_t i;
+ /* Strip whitespace */
+ while (s < end) {
+ if (Py_UNICODE_ISSPACE(*s))
+ s++;
+ else if (Py_UNICODE_ISSPACE(end[-1]))
+ end--;
+ else
+ break;
+ }
+ length = end - s;
+ /* Copy to a new string */
+ result = PyUnicode_FromUnicode(s, length);
+ if (result == NULL)
+ return result;
+ p = PyUnicode_AS_UNICODE(result);
+ /* Iterate over code points */
+ for (i = 0; i < length; i++) {
+ Py_UNICODE ch = p[i];
+ if (!Py_ISDIGIT(ch)) {
+ int decimal = Py_UNICODE_TODECIMAL(ch);
+ if (decimal >= 0)
+ p[i] = '0' + decimal;
+ }
+ }
+ return result;
+}
/* --- Decimal Encoder ---------------------------------------------------- */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Index: Objects/floatobject.c
===================================================================
--- Objects/floatobject.c (revision 86843)
+++ Objects/floatobject.c (working copy)
@@ -175,52 +175,53 @@
{
const char *s, *last, *end;
double x;
- char buffer[256]; /* for errors */
- char *s_buffer = NULL;
+ PyObject *s_buffer = NULL;
Py_ssize_t len;
PyObject *result = NULL;
if (PyUnicode_Check(v)) {
- s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+ s_buffer = _PyUnicode_NormalizeDecimal(PyUnicode_AS_UNICODE(v),
+ PyUnicode_GET_SIZE(v));
if (s_buffer == NULL)
- return PyErr_NoMemory();
- if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
- PyUnicode_GET_SIZE(v),
- s_buffer,
- NULL))
- goto error;
- s = s_buffer;
- len = strlen(s);
+ return NULL;
+ s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+ if (s == NULL)
+ return NULL;
+ last = s + len;
}
else if (PyObject_AsCharBuffer(v, &s, &len)) {
PyErr_SetString(PyExc_TypeError,
"float() argument must be a string or a number");
return NULL;
}
- last = s + len;
-
- while (Py_ISSPACE(*s))
- s++;
+ else {
+ last = s + len;
+ /* strip space */
+ while (last - s > 0) {
+ if (Py_ISSPACE(*s))
+ s++;
+ else if (Py_ISSPACE(last[- 1]))
+ last--;
+ else
+ break;
+ }
+ }
/* We don't care about overflow or underflow. If the platform
* supports them, infinities and signed zeroes (on underflow) are
* fine. */
x = PyOS_string_to_double(s, (char **)&end, NULL);
- if (x == -1.0 && PyErr_Occurred())
- goto error;
- while (Py_ISSPACE(*end))
- end++;
- if (end == last)
- result = PyFloat_FromDouble(x);
- else {
- PyOS_snprintf(buffer, sizeof(buffer),
- "invalid literal for float(): %.200s", s);
- PyErr_SetString(PyExc_ValueError, buffer);
+ if (end != last) {
+ PyErr_Format(PyExc_ValueError,
+ "could not convert string to float: "
+ "%.200s", s);
result = NULL;
}
-
- error:
- if (s_buffer)
- PyMem_FREE(s_buffer);
+ else if (x == -1.0 && PyErr_Occurred())
+ result = NULL;
+ else
+ result = PyFloat_FromDouble(x);
+
+ Py_XDECREF(s_buffer);
return result;
}
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com