split string with hieroglyphs

2006-12-23 Thread Belize
Hi.
The essence of the problem is the following:
There are lines in UTF-8 of this form: "BZ?ツーリTV%ツキDVD"
Is it possible to split them into fragments that contain only Latin
printable symbols (alphabet + "?#" etc.)
and fragments with the hieroglyphs, so the result would look like this:
['BZ?', '\xe3\x83\x84\xe3\x83\xbc\xe3\x83\xaa', 'TV%',
'\xe3\x83\x84\xe3\x82\xad', 'DVD'] ?
Then, after translating the hieroglyphs, the fragments need to be joined
back into one line, so the result would look like this:
"BZ? navigation TV% display DVD"
Thanks.

-- 
http://mail.python.org/mailman/listinfo/python-list

Re: split string with hieroglyphs

2006-12-24 Thread Belize
Steven, thanks! Very nice algorithm.
Here is code:


#!/usr/bin/env python
# -*- coding: utf_8 -*-

# Thanks Steven D'Aprano for hints

import unicodedata
import MySQLdb

# MySQL connection settings
mysql_host = "localhost"
mysql_user = "dict"
mysql_password = "passwd"
mysql_db = "dictionary"

try:
    mysql_conn = MySQLdb.connect(mysql_host, mysql_user, mysql_password,
                                 mysql_db)
    cur = mysql_conn.cursor()
    # Exchange text with the server as UTF-8 for this session.
    cur.execute("""SET NAMES UTF8""")
except MySQLdb.Error:
    # Without a cursor every later dictionary lookup would die with a
    # NameError, so stop here with the diagnostic instead of carrying on.
    raise SystemExit("unable insert to MySQL, check connection")

# Sample input. The u"" literal is decoded using the file's declared source
# encoding (UTF-8), so no separate decode step is needed.
jap_text = u"BZツーリTVツキDVD?"
# Fight the full-width / half-width katakana madness :-)
# NFKC folds the compatibility variants into their canonical characters.
jap_text = unicodedata.normalize('NFKC', jap_text)
# Back to UTF-8 bytes, the form the rest of the script works on.
jap_text = jap_text.encode('utf-8')

def translate_hieroglyph(jap_text):
    """Look up the English translation of a Japanese fragment.

    Runs a query through the module-level cursor ``cur`` against the
    ``dictionary`` table, matching the ``Jis`` column to *jap_text* and
    reading the ``Eng`` column of the first matching row.

    Args:
        jap_text: The fragment to translate.

    Returns:
        The stored translation, or *jap_text* unchanged when no row
        matches or the stored translation is empty.
    """
    # Pass the value as a query parameter so the driver quotes it; building
    # the SQL with string interpolation breaks on quotes in the fragment
    # and allows SQL injection.
    mysql_translate_query = (
        "SELECT Eng FROM dictionary where Jis=%s "
        "collate utf8_unicode_ci LIMIT 1"
    )
    cur.execute(mysql_translate_query, (jap_text,))
    row = cur.fetchone()  # LIMIT 1: there is at most one row
    eng_text = row[0] if row else ""
    # Fall back to the untranslated fragment when nothing useful was found.
    return eng_text or jap_text

def islatin(s):
    """Return True if *s* consists solely of ASCII characters.

    Accepts both byte strings and text strings; an empty string counts
    as ASCII.

    Args:
        s: The byte string or text string to check.

    Returns:
        True when every character of *s* is ASCII, False otherwise.
    """
    try:
        if isinstance(s, bytes):
            s.decode('ascii')
        else:
            s.encode('ascii')
    except UnicodeError:
        # Covers both UnicodeDecodeError and UnicodeEncodeError.
        return False
    return True

def split_fragments(s):
    """Split *s* into alternating runs of ASCII and non-ASCII characters.

    Each element of *s* is classified with ``islatin``; consecutive
    elements with the same classification are joined into one fragment,
    preserving their order.

    Args:
        s: The string to split.

    Returns:
        A list of fragments. An empty *s* yields an empty list rather
        than a list holding one empty fragment.
    """
    from itertools import groupby

    # groupby starts a new run every time the islatin() result changes,
    # and it also emits the final run, so no trailing flush is needed.
    return [''.join(run) for _, run in groupby(s, key=islatin)]

# Split the sample text into its ASCII and non-ASCII fragments.
fragments = split_fragments(jap_text)

def join_fragments(fragments):
    """Translate the non-ASCII fragments and join all of them with spaces.

    Fragments that ``islatin`` accepts are kept unchanged; every other
    fragment is replaced by the result of ``translate_hieroglyph``.

    Args:
        fragments: The fragments to process, in output order.

    Returns:
        The processed fragments joined by single spaces.
    """
    words = [
        piece if islatin(piece) else translate_hieroglyph(piece)
        for piece in fragments
    ]
    return ' '.join(words)

# Show the translated line.
print(join_fragments(fragments))


[EMAIL PROTECTED] ~/Src/Code $ python translate.py
BZ navigation TV display DVD?

Works as needed :-) Thanks again!

-- 
http://mail.python.org/mailman/listinfo/python-list