Alexandre Vassalotti added the comment:

Christian wrote:
> Alexandre's mangle loop doesn't do the same job as mine. Chars like _
> and - aren't removed from the encoding name and the if clauses don't
> catch for example UTF-8 or ISO-8859-1 only UTF8 or ISO8859-1. 

That isn't true. My mangler does exactly the same thing as your
original one.

However, I forgot to add Py_CHARMASK to the calls of tolower() and
isalnum() which would cause problems on platforms with signed char.

> Also he has overseen a PyString_Check in the code repr function.

Fixed.

> We have to get the codecs up and Py_FileSystemEncoding before we can
> decode the filenames. :( I think that the problem needs much more
> attention and a proper fix.

Maybe adding a global variable, let's say _Py_Codecs_Ready, could be
used to notify PyUnicode_DecodeFSDefault that it can use
PyUnicode_Decode, instead of relying only on the built-in codecs. That
would be much simpler than changing the bootstrapping process.

Here is yet another updated patch. The changes are the following:

   - Add Py_CHARMASK to tolower() and isalnum() calls in
     PyUnicode_DecodeFSDefault().
   - Use PyUnicode_Check(), instead of PyString_Check(), in
     code_repr().
   - Update comments for co_filename and co_name in PyCodeObject
     struct definition.
   - Fix a PyString_AS_STRING(co->co_name) instance in compile.c
   - Replace %S with %U in PyErr_Format() calls for substituting co_name.

One more thing: frozen.c needs to be updated. The module data contains
a code object with a PyString co_name. However, there is a bug in the
imp module (it doesn't detect the encoding from modelines, which causes
io.TextIOWrapper to crash) that prevents me from generating the data
for frozen.c.

__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1272>
__________________________________
Index: Python/ceval.c
===================================================================
--- Python/ceval.c	(revision 58455)
+++ Python/ceval.c	(working copy)
@@ -767,7 +767,7 @@
 	lltrace = PyDict_GetItemString(f->f_globals, "__lltrace__") != NULL;
 #endif
 #if defined(Py_DEBUG) || defined(LLTRACE)
-	filename = PyString_AsString(co->co_filename);
+	filename = PyUnicode_AsString(co->co_filename);
 #endif
 
 	why = WHY_NOT;
@@ -2565,7 +2565,7 @@
 		if (argcount > co->co_argcount) {
 			if (!(co->co_flags & CO_VARARGS)) {
 				PyErr_Format(PyExc_TypeError,
-				    "%S() takes %s %d "
+				    "%U() takes %s %d "
 				    "%spositional argument%s (%d given)",
 				    co->co_name,
 				    defcount ? "at most" : "exactly",
@@ -2599,7 +2599,7 @@
 			int j;
 			if (keyword == NULL || !PyUnicode_Check(keyword)) {
 				PyErr_Format(PyExc_TypeError,
-				    "%S() keywords must be strings",
+				    "%U() keywords must be strings",
 				    co->co_name);
 				goto fail;
 			}
@@ -2622,7 +2622,7 @@
 			if (j >= co->co_argcount + co->co_kwonlyargcount) {
 				if (kwdict == NULL) {
 					PyErr_Format(PyExc_TypeError,
-					    "%S() got an unexpected "
+					    "%U() got an unexpected "
 					    "keyword argument '%S'",
 					    co->co_name,
 					    keyword);
@@ -2633,7 +2633,7 @@
 			else {
 				if (GETLOCAL(j) != NULL) {
 					PyErr_Format(PyExc_TypeError,
-					     "%S() got multiple "
+					     "%U() got multiple "
 					     "values for keyword "
 					     "argument '%S'",
 					     co->co_name,
@@ -2661,7 +2661,7 @@
 					continue;
 				}
 				PyErr_Format(PyExc_TypeError,
-					"%S() needs keyword-only argument %S",
+					"%U() needs keyword-only argument %S",
 					co->co_name, name);
 				goto fail;
 			}
@@ -2671,7 +2671,7 @@
 			for (i = argcount; i < m; i++) {
 				if (GETLOCAL(i) == NULL) {
 					PyErr_Format(PyExc_TypeError,
-					    "%S() takes %s %d "
+					    "%U() takes %s %d "
 					    "%spositional argument%s "
 					    "(%d given)",
 					    co->co_name,
@@ -2699,7 +2699,7 @@
 	else {
 		if (argcount > 0 || kwcount > 0) {
 			PyErr_Format(PyExc_TypeError,
-				     "%S() takes no arguments (%d given)",
+				     "%U() takes no arguments (%d given)",
 				     co->co_name,
 				     argcount + kwcount);
 			goto fail;
Index: Python/traceback.c
===================================================================
--- Python/traceback.c	(revision 58455)
+++ Python/traceback.c	(working copy)
@@ -229,10 +229,10 @@
 	while (tb != NULL && err == 0) {
 		if (depth <= limit) {
 			err = tb_displayline(f,
-			    PyString_AsString(
+			    PyUnicode_AsString(
 				    tb->tb_frame->f_code->co_filename),
 			    tb->tb_lineno,
-			    PyString_AsString(tb->tb_frame->f_code->co_name));
+			    PyUnicode_AsString(tb->tb_frame->f_code->co_name));
 		}
 		depth--;
 		tb = tb->tb_next;
Index: Python/pythonrun.c
===================================================================
--- Python/pythonrun.c	(revision 58455)
+++ Python/pythonrun.c	(working copy)
@@ -867,7 +867,8 @@
 		return -1;
 	d = PyModule_GetDict(m);
 	if (PyDict_GetItemString(d, "__file__") == NULL) {
-		PyObject *f = PyString_FromString(filename);
+		PyObject *f;
+		f = PyUnicode_DecodeFSDefault(filename);
 		if (f == NULL)
 			return -1;
 		if (PyDict_SetItemString(d, "__file__", f) < 0) {
Index: Python/import.c
===================================================================
--- Python/import.c	(revision 58455)
+++ Python/import.c	(working copy)
@@ -74,10 +74,11 @@
 		      3040 (added signature annotations)
 		      3050 (print becomes a function)
 		      3060 (PEP 3115 metaclass syntax)
-          3070 (PEP 3109 raise changes)
+		      3070 (PEP 3109 raise changes)
+		      3080 (PEP 3137 make __file__ and __name__ unicode)
 .
 */
-#define MAGIC (3070 | ((long)'\r'<<16) | ((long)'\n'<<24))
+#define MAGIC (3080 | ((long)'\r'<<16) | ((long)'\n'<<24))
 
 /* Magic word as global; note that _PyImport_Init() can change the
    value of this global to accommodate for alterations of how the
@@ -652,7 +653,7 @@
 	/* Remember the filename as the __file__ attribute */
 	v = NULL;
 	if (pathname != NULL) {
-		v = PyString_FromString(pathname);
+		v = PyUnicode_DecodeFSDefault(pathname);
 		if (v == NULL)
 			PyErr_Clear();
 	}
@@ -983,7 +984,7 @@
 		PySys_WriteStderr("import %s # directory %s\n",
 			name, pathname);
 	d = PyModule_GetDict(m);
-	file = PyString_FromString(pathname);
+	file = PyUnicode_DecodeFSDefault(pathname);
 	if (file == NULL)
 		goto error;
 	path = Py_BuildValue("[O]", file);
Index: Python/compile.c
===================================================================
--- Python/compile.c	(revision 58455)
+++ Python/compile.c	(working copy)
@@ -1247,7 +1247,7 @@
 				PyObject_REPR(name), 
 				PyString_AS_STRING(c->u->u_name), 
 				reftype, arg,
-				PyString_AS_STRING(co->co_name),
+				PyUnicode_AsString(co->co_name),
 				PyObject_REPR(co->co_freevars));
 			Py_FatalError("compiler_make_closure()");
 		}
@@ -4001,7 +4001,7 @@
 	freevars = dict_keys_inorder(c->u->u_freevars, PyTuple_Size(cellvars));
 	if (!freevars)
 	    goto error;
-	filename = PyString_FromString(c->c_filename);
+	filename = PyUnicode_DecodeFSDefault(c->c_filename);
 	if (!filename)
 		goto error;
 
Index: Python/importdl.c
===================================================================
--- Python/importdl.c	(revision 58455)
+++ Python/importdl.c	(working copy)
@@ -62,7 +62,9 @@
 		return NULL;
 	}
 	/* Remember the filename as the __file__ attribute */
-	if (PyModule_AddStringConstant(m, "__file__", pathname) < 0)
+	PyObject *path;
+	path = PyUnicode_DecodeFSDefault(pathname);
+	if (PyModule_AddObject(m, "__file__", path) < 0)
 		PyErr_Clear(); /* Not important enough to report */
 
 	if (_PyImport_FixupExtension(name, pathname) == NULL)
Index: Include/code.h
===================================================================
--- Include/code.h	(revision 58455)
+++ Include/code.h	(working copy)
@@ -21,8 +21,8 @@
     PyObject *co_freevars;	/* tuple of strings (free variable names) */
     PyObject *co_cellvars;      /* tuple of strings (cell variable names) */
     /* The rest doesn't count for hash/cmp */
-    PyObject *co_filename;	/* string (where it was loaded from) */
-    PyObject *co_name;		/* string (name, for reference) */
+    PyObject *co_filename;	/* unicode (where it was loaded from) */
+    PyObject *co_name;		/* unicode (name, for reference) */
     int co_firstlineno;		/* first source line number */
     PyObject *co_lnotab;	/* string (encoding addr<->lineno mapping) */
     void *co_zombieframe;     /* for optimization only (see frameobject.c) */
Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h	(revision 58455)
+++ Include/unicodeobject.h	(working copy)
@@ -154,6 +154,7 @@
 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
+# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
@@ -245,6 +246,7 @@
 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
+# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
@@ -641,6 +643,20 @@
 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
     PyObject *, const char *);
 
+/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
+
+   If the encoding is one of the built-in codecs (i.e., UTF-8, UTF-16,
+   UTF-32, Latin-1, ASCII or MBCS), that codec is used. If it is not, or if
+   decoding fails, fall back to UTF-8 with the "replace" error handler.
+
+   The function is intended to be used for paths and file names only
+   during the bootstrapping process, when the codecs are not yet set up.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
+    const char *s               /* encoded string */
+    );
+
 /* Return a char* holding the UTF-8 encoded value of the
    Unicode object.
 
Index: Objects/codeobject.c
===================================================================
--- Objects/codeobject.c	(revision 58455)
+++ Objects/codeobject.c	(working copy)
@@ -50,6 +50,7 @@
 {
 	PyCodeObject *co;
 	Py_ssize_t i;
+
 	/* Check argument types */
 	if (argcount < 0 || nlocals < 0 ||
 	    code == NULL ||
@@ -58,20 +59,16 @@
 	    varnames == NULL || !PyTuple_Check(varnames) ||
 	    freevars == NULL || !PyTuple_Check(freevars) ||
 	    cellvars == NULL || !PyTuple_Check(cellvars) ||
-	    name == NULL || (!PyString_Check(name) && !PyUnicode_Check(name)) ||
-	    filename == NULL || !PyString_Check(filename) ||
+	    name == NULL || !PyUnicode_Check(name) ||
+	    filename == NULL || !PyUnicode_Check(filename) ||
 	    lnotab == NULL || !PyString_Check(lnotab) ||
 	    !PyObject_CheckReadBuffer(code)) {
 		PyErr_BadInternalCall();
 		return NULL;
 	}
-	if (PyString_Check(name)) {
-		name = PyUnicode_FromString(PyString_AS_STRING(name));
-		if (name == NULL)
-			return NULL;
-	} else {
-		Py_INCREF(name);
-	}
+	Py_INCREF(name);
+	Py_INCREF(filename);
+
 	intern_strings(names);
 	intern_strings(varnames);
 	intern_strings(freevars);
@@ -299,8 +296,8 @@
 
 	if (co->co_firstlineno != 0)
 		lineno = co->co_firstlineno;
-	if (co->co_filename && PyString_Check(co->co_filename))
-		filename = PyString_AS_STRING(co->co_filename);
+	if (co->co_filename && PyUnicode_Check(co->co_filename))
+		filename = PyUnicode_AsString(co->co_filename);
 	return PyUnicode_FromFormat(
 	                "<code object %.100U at %p, file \"%.300s\", line %d>",
 	                co->co_name, co, filename, lineno);
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(revision 58455)
+++ Objects/unicodeobject.c	(working copy)
@@ -1231,6 +1231,57 @@
     return v;
 }
 
+PyObject*
+PyUnicode_DecodeFSDefault(const char *s)
+{
+    /* Decode the NUL-terminated string s with Py_FileSystemDefaultEncoding.
+       Returns the decoded object, or NULL if even the UTF-8 fallback fails. */
+    PyObject *v = NULL;
+    Py_ssize_t size = (Py_ssize_t)strlen(s);
+    const char *encoding = Py_FileSystemDefaultEncoding;
+    enum { N = 16 };
+    char mangled[N];
+    char *p = mangled;  /* declared with the other locals (C89) */
+
+    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
+       can be undefined. If that is the case, decode using UTF-8. */
+    if (encoding == NULL)
+        return PyUnicode_DecodeUTF8(s, size, "replace");
+
+    /* Py_FileSystemDefaultEncoding is not guaranteed to be normalized, so
+       lower-case it and drop non-alphanumerics ("ISO-8859-1" -> "iso88591"). */
+    while (*encoding && (p - mangled) < (N - 1)) {
+        if (isalnum(Py_CHARMASK(*encoding))) {
+            *p++ = tolower(Py_CHARMASK(*encoding));
+        }
+        encoding++;
+    }
+    *p = '\0';
+
+    if (strcmp(mangled, "utf8") == 0)
+        v = PyUnicode_DecodeUTF8(s, size, NULL);
+    else if (strcmp(mangled, "utf16") == 0)
+        v = PyUnicode_DecodeUTF16(s, size, NULL, 0);
+    else if (strcmp(mangled, "utf32") == 0)
+        v = PyUnicode_DecodeUTF32(s, size, NULL, 0);
+    else if ((strcmp(mangled, "latin1") == 0)
+             || (strcmp(mangled, "iso88591") == 0))  /* '-' already stripped */
+        v = PyUnicode_DecodeLatin1(s, size, NULL);
+    else if (strcmp(mangled, "ascii") == 0)
+        v = PyUnicode_DecodeASCII(s, size, NULL);
+#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+    else if (strcmp(mangled, "mbcs") == 0)
+        v = PyUnicode_DecodeMBCS(s, size, NULL);
+#endif
+    /* Unknown encoding, or the strict decode failed: discard that error
+       and fall back to UTF-8 with the "replace" handler. */
+    if (v == NULL) {
+        PyErr_Clear();
+        v = PyUnicode_DecodeUTF8(s, size, "replace");
+    }
+    return v;
+}
+
 char*
 PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
 {
Index: Objects/moduleobject.c
===================================================================
--- Objects/moduleobject.c	(revision 58455)
+++ Objects/moduleobject.c	(working copy)
@@ -86,12 +86,12 @@
 	d = ((PyModuleObject *)m)->md_dict;
 	if (d == NULL ||
 	    (fileobj = PyDict_GetItemString(d, "__file__")) == NULL ||
-	    !PyString_Check(fileobj))
+	    !PyUnicode_Check(fileobj))
 	{
 		PyErr_SetString(PyExc_SystemError, "module filename missing");
 		return NULL;
 	}
-	return PyString_AsString(fileobj);
+	return PyUnicode_AsString(fileobj);
 }
 
 void
Index: Modules/_ctypes/callbacks.c
===================================================================
--- Modules/_ctypes/callbacks.c	(revision 58455)
+++ Modules/_ctypes/callbacks.c	(working copy)
@@ -34,9 +34,9 @@
 	PyCodeObject *py_code = 0;
 	PyFrameObject *py_frame = 0;
     
-	py_srcfile = PyString_FromString(filename);
+	py_srcfile = PyUnicode_DecodeFSDefault(filename);
 	if (!py_srcfile) goto bad;
-	py_funcname = PyString_FromString(funcname);
+	py_funcname = PyUnicode_FromString(funcname);
 	if (!py_funcname) goto bad;
 	py_globals = PyDict_New();
 	if (!py_globals) goto bad;
Index: Modules/pyexpat.c
===================================================================
--- Modules/pyexpat.c	(revision 58455)
+++ Modules/pyexpat.c	(working copy)
@@ -232,13 +232,13 @@
         code = PyString_FromString("");
         if (code == NULL)
             goto failed;
-        name = PyString_FromString(func_name);
+        name = PyUnicode_FromString(func_name);
         if (name == NULL)
             goto failed;
         nulltuple = PyTuple_New(0);
         if (nulltuple == NULL)
             goto failed;
-        filename = PyString_FromString(__FILE__);
+        filename = PyUnicode_DecodeFSDefault(__FILE__);
         handler_info[slot].tb_code =
             PyCode_New(0,		/* argcount */
                        0,       /* kwonlyargcount */
_______________________________________________
Python-bugs-list mailing list 
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to