Steven, thanks! Very nice algorithm.
Here is the code:

#!/usr/bin/env python
# -*- coding: utf_8 -*-

# Thanks Steven D'Aprano for hints

import unicodedata
import MySQLdb

# MySQL connection settings
mysql_host = "localhost"
mysql_user = "dict"
mysql_password = "passwd"
mysql_db = "dictionary"

# Open the connection, create the module-level cursor `cur` that
# translate_hieroglyph() queries through, and switch the session to UTF-8.
try:
    mysql_conn = MySQLdb.connect(mysql_host, mysql_user, mysql_password,
                                 mysql_db)
    cur = mysql_conn.cursor()
    cur.execute("""SET NAMES UTF8""")
except MySQLdb.Error:
    # Catch database errors only (a bare except would also hide typos and
    # KeyboardInterrupt), then re-raise: carrying on would leave `cur`
    # undefined and fail later with an unrelated NameError.
    print("unable insert to MySQL, check connection")
    raise

# Sample input: Latin text interleaved with katakana.
jap_text = "BZツーリTVツキDVD?"
# Fight with the full-width / half-width katakana madness :-)
# Decode the UTF-8 bytes, apply NFKC normalization (which folds
# compatibility forms such as half-width katakana and full-width Latin
# letters into their standard equivalents), then re-encode to UTF-8 bytes.
jap_text = unicode(jap_text, 'utf-8')
jap_text = unicodedata.normalize('NFKC', jap_text)
jap_text = jap_text.encode('utf-8')

def translate_hieroglyph(jap_text):
    """Look up the English translation of a Japanese fragment.

    Queries the ``dictionary`` table through the module-level cursor ``cur``
    for a row whose ``Jis`` column matches ``jap_text`` under the
    ``utf8_unicode_ci`` collation.

    Args:
        jap_text: the fragment to translate.

    Returns:
        The ``Eng`` value of the first matching row, or ``jap_text`` itself
        when no row matches or the stored translation is empty.
    """
    # Pass the value as a query parameter so the driver quotes it; building
    # the SQL with % interpolation breaks on fragments containing a quote
    # and allows SQL injection.
    mysql_translate_query = ("SELECT Eng FROM dictionary where Jis=%s "
                             "collate utf8_unicode_ci LIMIT 1")
    cur.execute(mysql_translate_query, (jap_text,))
    row = cur.fetchone()  # LIMIT 1 guarantees at most one row
    if row and row[0]:
        return row[0]
    # No translation available: hand the fragment back untouched.
    return jap_text

def islatin(s):
    """Return True if ``s`` contains only ASCII characters, else False.

    Accepts either a byte string or a text (unicode) string; a byte string
    is checked by decoding it as ASCII, a text string by encoding it.
    The empty string counts as Latin.
    """
    try:
        if isinstance(s, bytes):
            s.decode('ascii')
        else:
            s.encode('ascii')
    except UnicodeError:
        # Explicit False rather than an implicit None for non-ASCII input.
        return False
    return True

def split_fragments(s):
    """Split ``s`` into maximal runs of Latin and non-Latin characters.

    Every element of ``s`` is classified with ``islatin``; consecutive
    elements with the same classification are joined into one fragment.

    Args:
        s: the string to split.

    Returns:
        A list of fragments in their original order.  An empty ``s`` gives
        an empty list.
    """
    # Local import keeps this function self-contained.
    from itertools import groupby

    # groupby also emits the final run, so the last fragment is never lost
    # and no special handling is needed after the loop.
    return [''.join(run)
            for _, run in groupby(s, key=lambda c: bool(islatin(c)))]

# Split the normalized sample text into Latin / non-Latin fragments.
fragments = split_fragments(jap_text)

def join_fragments(fragments):
    """Translate the non-Latin fragments and join everything with spaces.

    Fragments for which ``islatin`` is true are kept unchanged; every other
    fragment is replaced by the result of ``translate_hieroglyph``.  The
    pieces are returned, in order, as one space-separated string.
    """
    pieces = [
        piece if islatin(piece) else translate_hieroglyph(piece)
        for piece in fragments
    ]
    return ' '.join(pieces)

# Print the fragments joined into one line, with non-Latin parts translated.
print join_fragments(fragments)


[EMAIL PROTECTED] ~/Src/Code $ python translate.py
BZ navigation TV display DVD?

Works as needed :-) Thanks again!

-- 
http://mail.python.org/mailman/listinfo/python-list

Reply via email to