Hallöchen! Torsten Bronger writes:
# > I'd like to map general unicode strings to safe filename. I tried
# > punycode but it is case-sensitive, which Windows is not. Thus,
# > "Hallo" and "hallo" are mapped to "Hallo-" and "hallo-", however,
# > I need uppercase Latin letters being encoded, too, and the
# > encoding must contain only lowercase Latin letters, numbers,
# > underscores, and maybe a little bit more. The result should be
# > more legible than base64, though.
# >
# > Has anybody created such a codec already?
#
# Okay, the following works fine for me:
#
# --8<---------------cut here---------------start------------->8---
import codecs

# ``unichr`` only exists on Python 2; fall back to ``chr`` where it is missing
# so that decoding "{hex}" sequences does not fail with a NameError.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class Codec(codecs.Codec):
    """Codec class for safe filenames.

    Safe filenames work on all important filesystems, i.e., they don't
    contain special or dangerous characters, and they don't assume that
    filenames are treated case-sensitively.

    Encoding rules:

    * characters listed in ``safe_characters`` are copied unchanged,
    * a space becomes ``_``,
    * a run of uppercase Latin letters becomes the lowercased run wrapped in
      parentheses,
    * every other character becomes its code point in lowercase hexadecimal
      wrapped in curly braces.

    >>> u"hallo".encode("safefilename")
    'hallo'
    >>> u"Hallo".encode("safefilename")
    '(h)allo'
    >>> u"MIT Thesis".encode("safefilename")
    '(mit)_(t)hesis'
    >>> u"Gesch\\u00e4ftsbrief".encode("safefilename")
    '(g)esch{e4}ftsbrief'

    Of course, the mapping works in both directions as expected:

    >>> "(g)esch{e4}ftsbrief".decode("safefilename")
    u'Gesch\\xe4ftsbrief'
    >>> "(mit)_(t)hesis".decode("safefilename")
    u'MIT Thesis'
    """

    lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
    safe_characters = lowercase_letters + "0123456789-+!$%&`'@~#.,^"
    uppercase_letters = lowercase_letters.upper()

    def encode(self, input, errors='strict'):
        """Convert Unicode strings to safe filenames.

        ``errors`` is accepted for interface compatibility but not used:
        every character has an encoding.

        Returns a tuple ``(encoded string, number of characters consumed)``.
        """
        parts = []
        i = 0
        input_length = len(input)
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                parts.append(str(c))
            elif c == " ":
                parts.append("_")
            elif c in self.uppercase_letters:
                # Swallow the whole run of uppercase letters so that it is
                # wrapped in a single pair of parentheses.
                start = i
                while i < input_length and input[i] in self.uppercase_letters:
                    i += 1
                parts.append("(" + str(input[start:i]).lower() + ")")
                continue
            else:
                parts.append("{%x}" % ord(c))
            i += 1
        return "".join(parts), input_length

    def handle_problematic_characters(self, errors, input, start, end,
                                      message):
        """Apply the error policy ``errors`` to an undecodable input span.

        Returns ``u""`` for ``'ignore'`` and ``u"?"`` for ``'replace'``.
        For any other policy a ``UnicodeDecodeError`` covering
        ``input[start:end]`` is raised with ``message`` as the reason.
        """
        if errors == 'ignore':
            return u""
        if errors == 'replace':
            return u"?"
        # UnicodeDecodeError wants a byte string as its object.  On Python 2
        # ``str`` already is one; otherwise convert (one byte per character,
        # so ``start``/``end`` stay valid).
        if not isinstance(input, bytes):
            input = input.encode("ascii", "replace")
        raise UnicodeDecodeError("safefilename", input, start, end, message)

    def decode(self, input, errors='strict'):
        """Convert safe filenames to Unicode strings.

        Malformed input is passed to ``handle_problematic_characters``, whose
        result (nothing, or a replacement character) is inserted into the
        output; with the default ``errors='strict'`` it raises
        ``UnicodeDecodeError`` instead.

        Returns a tuple ``(decoded string, length of the input)``.
        """
        input = str(input)
        input_length = len(input)
        parts = []
        i = 0
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                parts.append(c)
            elif c == "_":
                parts.append(u" ")
            elif c == "(":
                i += 1
                start = i
                while i < input_length and input[i] in self.lowercase_letters:
                    i += 1
                parts.append(input[start:i].upper())
                if i == input_length:
                    parts.append(self.handle_problematic_characters(
                        errors, input, i-1, i,
                        "open parenthesis was never closed"))
                    continue
                if input[i] != ')':
                    parts.append(self.handle_problematic_characters(
                        errors, input, i, i+1,
                        "invalid character '%s' in parentheses sequence"
                        % input[i]))
                    # Do not advance: the offending character is examined
                    # again as an ordinary character in the next iteration.
                    continue
            elif c == "{":
                end_position = input.find("}", i)
                if end_position == -1:
                    # No closing bracket at all: skip the hexadecimal digits
                    # that follow (bounded by the ``<= 8`` distance check)
                    # and report them as one error.
                    end_position = i+1
                    while (end_position < input_length
                           and input[end_position] in "0123456789abcdef"
                           and end_position - i <= 8):
                        end_position += 1
                    parts.append(self.handle_problematic_characters(
                        errors, input, i, end_position,
                        "open bracket was never closed"))
                    i = end_position
                    continue
                try:
                    parts.append(_unichr(int(input[i+1:end_position], 16)))
                except (ValueError, OverflowError):
                    # Not a hexadecimal number, or not a valid code point.
                    parts.append(self.handle_problematic_characters(
                        errors, input, i, end_position+1,
                        "invalid data between brackets"))
                i = end_position
            else:
                parts.append(self.handle_problematic_characters(
                    errors, input, i, i+1, "invalid character '%s'" % c))
            i += 1
        return u"".join(parts), input_length


class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that encodes with the safe-filename codec."""
    pass


class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that decodes with the safe-filename codec."""
    pass


def _registry(encoding):
    """Codec search function for ``codecs.register``.

    Returns the ``(encoder, decoder, stream reader, stream writer)`` tuple
    for the encoding name ``"safefilename"`` and ``None`` for any other name.
    """
    if encoding == "safefilename":
        codec = Codec()
        return (codec.encode, codec.decode, StreamReader, StreamWriter)
    return None


codecs.register(_registry)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
# --8<---------------cut here---------------end--------------->8---
#
# -- Torsten Bronger, aquisgrana, europa vetus Jabber ID: [EMAIL PROTECTED] (See http://ime.webhop.org for ICQ, MSN,
etc.) -- http://mail.python.org/mailman/listinfo/python-list