Colin Walters added the comment:
Patch to add Unicode support.
Note: this patch re-encodes shlex.py from ISO-8859-1 to UTF-8, so the patch
file itself contains a mix of both encodings.
----------
nosy: +cgwalters
Added file: http://bugs.python.org/file9025/shlex-unicode.patch
__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1170>
__________________________________
--- /usr/lib64/python2.5/shlex.py 2007-10-30 13:45:31.000000000 -0400
+++ hotwire/shlex.py 2007-12-22 13:05:52.000000000 -0500
@@ -1,4 +1,4 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
"""A lexical analyzer class for simple shell-like syntaxes."""
# Module and documentation by Eric S. Raymond, 21 Dec 1998
@@ -6,9 +6,11 @@
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
+# Modified to support Unicode by Colin Walters, Dec 2007
import os.path
import sys
+import unicodedata
from collections import deque
try:
@@ -20,7 +22,7 @@
class shlex:
"A lexical analyzer class for simple shell-like syntaxes."
- def __init__(self, instream=None, infile=None, posix=False):
+ def __init__(self, instream=None, infile=None, posix=False, utf=True):
if isinstance(instream, basestring):
instream = StringIO(instream)
if instream is not None:
@@ -34,13 +36,21 @@
self.eof = None
else:
self.eof = ''
+ self.utf = utf
self.commenters = '#'
self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
- if self.posix:
- self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
- 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
- self.whitespace = ' \t\r\n'
+ if self.posix and not self.utf:
+ self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
+ 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
+ elif self.posix:
+ # We dynamically determine character classes below, except
+ # by default _ is a word character
+ self.wordchars = '_'
+ if not self.utf:
+ self.whitespace = ' \t\r\n'
+ else:
+ self.whitespace = ''
self.whitespace_split = False
self.quotes = '\'"'
self.escape = '\\'
@@ -116,12 +126,22 @@
else:
print "shlex: token=EOF"
return raw
+
+ def __is_whitespace(self, c, category):
+ return c in self.whitespace or (self.utf and category[0] == 'Z')
+
+ def __is_wordchar(self, c, category):
+ return c in self.wordchars or (self.utf and category[0] in ('L', 'N'))
def read_token(self):
quoted = False
escapedstate = ' '
while True:
nextchar = self.instream.read(1)
+ if nextchar and self.utf:
+ nextcategory = unicodedata.category(nextchar)
+ else:
+ nextcategory = None
if nextchar == '\n':
self.lineno = self.lineno + 1
if self.debug >= 3:
@@ -134,7 +154,7 @@
if not nextchar:
self.state = None # end of file
break
- elif nextchar in self.whitespace:
+ if self.__is_whitespace(nextchar, nextcategory):
if self.debug >= 2:
print "shlex: I see whitespace in whitespace state"
if self.token or (self.posix and quoted):
@@ -147,7 +167,7 @@
elif self.posix and nextchar in self.escape:
escapedstate = 'a'
self.state = nextchar
- elif nextchar in self.wordchars:
+ elif self.__is_wordchar(nextchar, nextcategory):
self.token = nextchar
self.state = 'a'
elif nextchar in self.quotes:
@@ -199,8 +219,8 @@
elif self.state == 'a':
if not nextchar:
self.state = None # end of file
- break
- elif nextchar in self.whitespace:
+ break
+ if self.__is_whitespace(nextchar, nextcategory):
if self.debug >= 2:
print "shlex: I see whitespace in word state"
self.state = ' '
@@ -222,7 +242,7 @@
elif self.posix and nextchar in self.escape:
escapedstate = 'a'
self.state = nextchar
- elif nextchar in self.wordchars or nextchar in self.quotes \
+ elif self.__is_wordchar(nextchar, nextcategory) or nextchar in self.quotes \
or self.whitespace_split:
self.token = self.token + nextchar
else:
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com