[issue1170] shlex have problems with parsing unicode

Colin Walters Sat, 22 Dec 2007 10:22:27 -0800

Colin Walters added the comment:

Patch to add Unicode support.


Note: this patch re-encodes shlex.py from ISO-8859-1 to UTF-8, so the
patch file itself contains a mix of both encodings.

----------
nosy: +cgwalters
Added file: http://bugs.python.org/file9025/shlex-unicode.patch

__________________________________
Tracker <[EMAIL PROTECTED]>
<http://bugs.python.org/issue1170>
__________________________________

--- /usr/lib64/python2.5/shlex.py	2007-10-30 13:45:31.000000000 -0400
+++ hotwire/shlex.py	2007-12-22 13:05:52.000000000 -0500
@@ -1,4 +1,4 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 """A lexical analyzer class for simple shell-like syntaxes."""
 
 # Module and documentation by Eric S. Raymond, 21 Dec 1998
@@ -6,9 +6,11 @@
 # push_source() and pop_source() made explicit by ESR, January 2001.
 # Posix compliance, split(), string arguments, and
 # iterator interface by Gustavo Niemeyer, April 2003.
+# Modified to support Unicode by Colin Walters, Dec 2007
 
 import os.path
 import sys
+import unicodedata
 from collections import deque
 
 try:
@@ -20,7 +22,7 @@
 
 class shlex:
     "A lexical analyzer class for simple shell-like syntaxes."
-    def __init__(self, instream=None, infile=None, posix=False):
+    def __init__(self, instream=None, infile=None, posix=False, utf=True):
         if isinstance(instream, basestring):
             instream = StringIO(instream)
         if instream is not None:
@@ -34,13 +36,21 @@
             self.eof = None
         else:
             self.eof = ''
+        self.utf = utf
         self.commenters = '#'
         self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                           'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
-        if self.posix:
-            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
-                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
-        self.whitespace = ' \t\r\n'
+        if self.posix and not self.utf:
+            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
+                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
+        elif self.posix:
+            # We dynamically determine character classes below, except
+            # by default _ is a word character
+            self.wordchars = '_'
+        if not self.utf:
+            self.whitespace = ' \t\r\n'
+        else:
+            self.whitespace = ''
         self.whitespace_split = False
         self.quotes = '\'"'
         self.escape = '\\'
@@ -116,12 +126,22 @@
             else:
                 print "shlex: token=EOF"
         return raw
+    
+    def __is_whitespace(self, c, category):
+        return c in self.whitespace or (self.utf and category[0] == 'Z')        
+    
+    def __is_wordchar(self, c, category):
+        return c in self.wordchars or (self.utf and category[0] in ('L', 'N'))        
 
     def read_token(self):
         quoted = False
         escapedstate = ' '
         while True:
             nextchar = self.instream.read(1)
+            if nextchar and self.utf:
+                nextcategory = unicodedata.category(nextchar)
+            else:
+                nextcategory = None            
             if nextchar == '\n':
                 self.lineno = self.lineno + 1
             if self.debug >= 3:
@@ -134,7 +154,7 @@
                 if not nextchar:
                     self.state = None  # end of file
                     break
-                elif nextchar in self.whitespace:
+                if self.__is_whitespace(nextchar, nextcategory):
                     if self.debug >= 2:
                         print "shlex: I see whitespace in whitespace state"
                     if self.token or (self.posix and quoted):
@@ -147,7 +167,7 @@
                 elif self.posix and nextchar in self.escape:
                     escapedstate = 'a'
                     self.state = nextchar
-                elif nextchar in self.wordchars:
+                elif self.__is_wordchar(nextchar, nextcategory):
                     self.token = nextchar
                     self.state = 'a'
                 elif nextchar in self.quotes:
@@ -199,8 +219,8 @@
             elif self.state == 'a':
                 if not nextchar:
                     self.state = None   # end of file
-                    break
-                elif nextchar in self.whitespace:
+                    break                
+                if self.__is_whitespace(nextchar, nextcategory):
                     if self.debug >= 2:
                         print "shlex: I see whitespace in word state"
                     self.state = ' '
@@ -222,7 +242,7 @@
                 elif self.posix and nextchar in self.escape:
                     escapedstate = 'a'
                     self.state = nextchar
-                elif nextchar in self.wordchars or nextchar in self.quotes \
+                elif self.__is_wordchar(nextchar, nextcategory) or nextchar in self.quotes \
                     or self.whitespace_split:
                     self.token = self.token + nextchar
                 else:

_______________________________________________
Python-bugs-list mailing list 
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

[issue1170] shlex have problems with parsing unicode

Reply via email to