Hi, OS = Windows XP (German language), Python = 3.1.2
I need to write a directory listing into an XML file, and after hours of trying and searching I still have no clue. My main problem is that the file and folder names can contain characters from different languages such as German, Turkish, Russian, and possibly others. Because Python 3.1 is better with Unicode, I decided to use that instead of 2.6. For testing I have created the following files: http://img340.imageshack.us/img340/3461/files.png (google for the words russia, turkish, deutsch, france to find websites with special characters, then copy & paste them) And this is the code I have now: ############################################ # -*- coding: iso-8859-1 -*- # inspired by: # http://www.dpawson.co.uk/java/dirlist.py # (for Python ~2.4) import sys print ('filesystemencoding: ' + sys.getfilesystemencoding()) print ('defaultencoding: ' + sys.getdefaultencoding()) from pprint import pprint import os.path from stat import * from xml.sax.saxutils import XMLGenerator def recurse_dir(path, writer): for cdir, subdirs, files in os.walk(path): pprint (cdir) writer.startElement('dir', { 'name': cdir }) for f in files: uf = f.encode('utf-8') pprint (uf) attribs = {'name': f} attribs['size'] = str(os.stat(os.path.join(cdir,f))[ST_SIZE]) pprint (attribs) writer.startElement('file', attribs) writer.endElement('file') for subdir in subdirs: recurse_dir(os.path.join(cdir, subdir), writer) writer.endElement('directory') break if __name__ == '__main__': directory = 'c:\\_TEST\\' out = open('C:\\_TEST.xml','w') writer = XMLGenerator(out, 'utf-8') writer.startDocument() recurse_dir(directory, writer) out.close() ############################################ And this is the output: ----------------------------- filesystemencoding: mbcs defaultencoding: utf-8 'c:\\_TEST\\' b'1 English.txt' {'name': '1 English.txt', 'size': '0'} b'2 German \xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f.txt' {'name': '2 German äöüß.txt', 'size': '0'} b'3 France \xc3\xa1\xc3\xa0\xc3\xa2\xc3\xba\xc3\xb9\xc3\xbb.txt' {'name': '3 France áàâúùû.txt', 
'size': '0'} b'4 Russia \xd0\xa0\xd0\xbe\xd1\x81\xd1\x81\xd1\x96\xd0\xb9\xd1\x81\xd0\xba\xd0\xb0\xd1\x8f \xd0\x98\xd0\xbc\xd0\xbf\xd0\xb5\xd1\x80\xd1\x96\xd1\x8f.txt' Traceback (most recent call last): File "test.py", line 36, in <module> recurse_dir(directory, writer) File "test.py", line 22, in recurse_dir pprint (attribs) File "F:\Dev\Python31\lib\pprint.py", line 55, in pprint printer.pprint(object) File "F:\Dev\Python31\lib\pprint.py", line 132, in pprint self._format(object, self._stream, 0, 0, {}, 0) File "F:\Dev\Python31\lib\pprint.py", line 238, in _format write(rep) File "F:\Dev\Python31\lib\encodings\cp850.py", line 19, in encode return codecs.charmap_encode(input,self.errors,encoding_map)[0] UnicodeEncodeError: 'charmap' codec can't encode characters in position 19-28: character maps to <undefined> ----------------------------- I also tried the line: attribs = {'name': uf} with this result: ----------------------------- filesystemencoding: mbcs defaultencoding: utf-8 'c:\\_TEST\\' b'1 English.txt' {'name': b'1 English.txt', 'size': '0'} Traceback (most recent call last): File "test.py", line 36, in <module> recurse_dir(directory, writer) File "test.py", line 23, in recurse_dir writer.startElement('file', attribs) File "F:\Dev\Python31\lib\xml\sax\saxutils.py", line 127, in startElement self._write(' %s=%s' % (name, quoteattr(value))) File "F:\Dev\Python31\lib\xml\sax\saxutils.py", line 68, in quoteattr data = escape(data, entities) File "F:\Dev\Python31\lib\xml\sax\saxutils.py", line 34, in escape data = data.replace("&", "&amp;") TypeError: expected an object with the buffer interface ----------------------------- Maybe this 'data = data.replace("&", "&amp;")' in saxutils is wrong? Maybe there should be a version of it for the "bytes" data type? Thank you! -- http://mail.python.org/mailman/listinfo/python-list