Alexander Belopolsky <belopol...@users.sourceforge.net> added the comment:

On Mon, Nov 29, 2010 at 4:41 AM, Marc-Andre Lemburg
<rep...@bugs.python.org> wrote:
..
> It would be better to copy and iterate over the Unicode string first,
> replacing any decimal code points with ASCII ones and then call the
> UTF-8 encoder.
>

Good idea.

> The code as it stands is very inefficient, since it will most likely
> run the memcpy() part for every code point after the first non-ASCII
> decimal one.
>

I doubt there are measurable gains from this optimization, but doing
the conversion on Unicode characters results in a cleaner API.  The new
patch, issue10557a.diff, implements
_PyUnicode_NormalizeDecimal(Py_UNICODE *s, Py_ssize_t length), which is
documented as follows:

/* Strip leading and trailing space and convert code points that have decimal
   digit property to the corresponding ASCII digit code point.

   Returns a new Unicode string on success, NULL on failure.
*/

Note that I used the deprecated _PyUnicode_AsStringAndSize() in
floatobject.c not only because it is convenient, but also because I
believe that in the future numerical value parsers should be converted
to operate on Unicode characters.  When this happens, the use of
_PyUnicode_AsStringAndSize() can be removed.

----------
Added file: http://bugs.python.org/file19872/issue10557a.diff

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue10557>
_______________________________________
Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h     (revision 86843)
+++ Include/unicodeobject.h     (working copy)
@@ -1173,6 +1173,17 @@
     const char *errors          /* error handling */
     );
 
+/* Strip leading and trailing space and convert code points that have decimal
+   digit property to the corresponding ASCII digit code point. 
+
+   Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) _PyUnicode_NormalizeDecimal(
+    Py_UNICODE *s,              /* Unicode buffer */
+    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
+    );
+
 /* --- File system encoding ---------------------------------------------- */
 
 /* ParseTuple converter: encode str objects to bytes using
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c     (revision 86843)
+++ Objects/unicodeobject.c     (working copy)
@@ -6207,6 +6207,40 @@
     return NULL;
 }
 
+PyObject *
+_PyUnicode_NormalizeDecimal(Py_UNICODE *s,
+                            Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    const Py_UNICODE *end = s + length;
+    Py_ssize_t i;
+    /* Strip whitespace */
+    while (s < end) {
+        if (Py_UNICODE_ISSPACE(*s))
+            s++;
+        else if (Py_UNICODE_ISSPACE(end[-1]))
+            end--;
+        else
+            break;
+    }
+    length = end - s;
+    /* Copy to a new string */
+    result = PyUnicode_FromUnicode(s, length);
+    if (result == NULL)
+        return result;
+    p = PyUnicode_AS_UNICODE(result);
+    /* Iterate over code points */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = p[i];
+        if (!Py_ISDIGIT(ch)) {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
 /* --- Decimal Encoder ---------------------------------------------------- */
 
 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Index: Objects/floatobject.c
===================================================================
--- Objects/floatobject.c       (revision 86843)
+++ Objects/floatobject.c       (working copy)
@@ -175,52 +175,53 @@
 {
     const char *s, *last, *end;
     double x;
-    char buffer[256]; /* for errors */
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
     Py_ssize_t len;
     PyObject *result = NULL;
 
     if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+        s_buffer = _PyUnicode_NormalizeDecimal(PyUnicode_AS_UNICODE(v),
+                                               PyUnicode_GET_SIZE(v));
         if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
-            goto error;
-        s = s_buffer;
-        len = strlen(s);
+            return NULL;
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL)
+            return NULL;
+        last = s + len;
     }
     else if (PyObject_AsCharBuffer(v, &s, &len)) {
         PyErr_SetString(PyExc_TypeError,
             "float() argument must be a string or a number");
         return NULL;
     }
-    last = s + len;
-
-    while (Py_ISSPACE(*s))
-        s++;
+    else {
+        last = s + len;
+        /* strip space */
+        while (last - s > 0) { 
+            if (Py_ISSPACE(*s))
+                s++;
+            else if (Py_ISSPACE(last[- 1]))
+                last--;
+            else
+                break;
+        }
+    }
     /* We don't care about overflow or underflow.  If the platform
      * supports them, infinities and signed zeroes (on underflow) are
      * fine. */
     x = PyOS_string_to_double(s, (char **)&end, NULL);
-    if (x == -1.0 && PyErr_Occurred())
-        goto error;
-    while (Py_ISSPACE(*end))
-        end++;
-    if (end == last)
-        result = PyFloat_FromDouble(x);
-    else {
-        PyOS_snprintf(buffer, sizeof(buffer),
-                      "invalid literal for float(): %.200s", s);
-        PyErr_SetString(PyExc_ValueError, buffer);
+    if (end != last) {
+        PyErr_Format(PyExc_ValueError,
+                     "could not convert string to float: "
+                     "%.200s", s);
         result = NULL;
     }
-
-  error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    else if (x == -1.0 && PyErr_Occurred())
+        result = NULL;
+    else
+        result = PyFloat_FromDouble(x);
+    
+    Py_XDECREF(s_buffer);
     return result;
 }
 
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to