Steven, thanks! Very nice algorithm. Here is the code:
#!/usr/bin/env python
# -*- coding: utf_8 -*-
# Thanks Steven D'Aprano for hints
"""Translate the non-Latin runs of a mixed Latin/Japanese string.

The text is split into alternating ASCII and non-ASCII fragments, every
non-ASCII fragment is looked up in a MySQL dictionary table, and the pieces
are joined back together with single spaces.
"""
import sys
import unicodedata
from itertools import groupby

# MySQL variables
mysql_host = "localhost"
mysql_user = "dict"
mysql_password = "passwd"
mysql_db = "dictionary"

# Cursor read by translate_hieroglyph(); main() assigns it after connecting.
cur = None


def translate_hieroglyph(jap_text):
    """Return the dictionary translation of jap_text, or jap_text itself.

    Reads the ``Eng`` column of the first ``dictionary`` row whose ``Jis``
    column equals jap_text. When no row matches, or the stored value is
    empty, the input is returned unchanged.
    """
    # Pass the value as a query parameter so the driver quotes it; building
    # the statement with string formatting breaks on quotes and allows SQL
    # injection.
    cur.execute(
        "SELECT Eng FROM dictionary where Jis=%s "
        "collate utf8_unicode_ci LIMIT 1",
        (jap_text,))
    row = cur.fetchone()
    if row and row[0]:
        return row[0]
    return jap_text


def islatin(s):
    """Return True when every character of s is in the ASCII range.

    An empty string counts as Latin.
    """
    return all(ord(ch) < 128 for ch in s)


def split_fragments(s):
    """Split s into maximal runs of ASCII and non-ASCII characters.

    The runs are returned in their original order; an empty input gives an
    empty list.
    """
    # groupby() also yields the final run, so a string that ends in
    # non-Latin characters keeps its last fragment.
    return [''.join(run) for _, run in groupby(s, key=islatin)]


def join_fragments(fragments):
    """Translate the non-Latin fragments and join everything with spaces."""
    translated = []
    for fragment in fragments:
        if islatin(fragment):
            translated.append(fragment)
        else:
            translated.append(translate_hieroglyph(fragment))
    return ' '.join(translated)


def main():
    """Connect to MySQL, translate the sample string and print the result."""
    global cur
    # Imported here so the helpers above stay importable on machines
    # without the MySQL driver.
    import MySQLdb

    try:
        mysql_conn = MySQLdb.connect(mysql_host, mysql_user,
                                     mysql_password, mysql_db)
        cur = mysql_conn.cursor()
        cur.execute("""SET NAMES UTF8""")
    except MySQLdb.Error:
        # Nothing can be translated without the database, so stop here
        # instead of failing later on a missing cursor.
        print("unable insert to MySQL, check connection")
        sys.exit(1)

    try:
        jap_text = "BZツーリTVツキDVD?"
        jap_text = unicode(jap_text, 'utf-8')
        # fight with full-width, half-width katakana madness :-)
        jap_text = unicodedata.normalize('NFKC', jap_text)
        # NOTE(review): the whitespace-collapsed original is ambiguous about
        # whether this re-encoding step was active or commented out; confirm.
        jap_text = jap_text.encode('utf-8')

        fragments = split_fragments(jap_text)
        print(join_fragments(fragments))
    finally:
        mysql_conn.close()


if __name__ == "__main__":
    main()

# Example session:
#
#   [EMAIL PROTECTED] ~/Src/Code $ python translate.py
#   BZ navigation TV display DVD?
#
# Work as needed :-)
# Thanks again!
# --
# http://mail.python.org/mailman/listinfo/python-list